{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.02, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ce_loss": 2.321008, "epoch": 1e-05, "grad_norm": 0.004705373197793961, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.096191, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.104492, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.126953, "key_mse_loss_layer_015": 0.115723, "key_mse_loss_layer_016": 0.111328, "key_mse_loss_layer_017": 0.108887, "key_mse_loss_layer_018": 0.120117, "key_mse_loss_layer_019": 0.095215, "key_mse_loss_layer_020": 0.10791, "key_mse_loss_layer_021": 0.102539, "key_mse_loss_layer_022": 0.108887, "key_mse_loss_layer_023": 0.10791, "key_mse_loss_layer_024": 0.087402, "key_mse_loss_layer_025": 0.080078, "key_mse_loss_layer_026": 0.09668, "key_mse_loss_layer_027": 0.095215, "key_mse_loss_layer_028": 0.101074, "key_mse_loss_layer_029": 0.09082, "key_mse_loss_layer_030": 0.100586, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.160339, "kv_vq_loss": 0.008491, "learning_rate": 0.00024999999999999995, "loss": 0.168823, "step": 10, "value_mse_loss_layer_000": 0.002319, "value_mse_loss_layer_001": 0.010559, "value_mse_loss_layer_002": 0.050537, "value_mse_loss_layer_003": 0.080566, "value_mse_loss_layer_004": 0.074219, "value_mse_loss_layer_005": 0.073242, "value_mse_loss_layer_006": 0.083496, "value_mse_loss_layer_007": 0.101074, "value_mse_loss_layer_008": 0.107422, "value_mse_loss_layer_009": 0.141602, "value_mse_loss_layer_010": 0.132812, "value_mse_loss_layer_011": 0.146484, "value_mse_loss_layer_012": 0.148438, "value_mse_loss_layer_013": 0.15332, "value_mse_loss_layer_014": 0.163086, "value_mse_loss_layer_015": 0.135742, "value_mse_loss_layer_016": 0.146484, "value_mse_loss_layer_017": 0.137695, "value_mse_loss_layer_018": 0.173828, "value_mse_loss_layer_019": 0.189453, "value_mse_loss_layer_020": 0.188477, "value_mse_loss_layer_021": 0.208984, "value_mse_loss_layer_022": 0.213867, "value_mse_loss_layer_023": 0.246094, "value_mse_loss_layer_024": 0.298828, "value_mse_loss_layer_025": 0.347656, "value_mse_loss_layer_026": 0.257812, "value_mse_loss_layer_027": 0.414062, "value_mse_loss_layer_028": 0.357422, "value_mse_loss_layer_029": 0.515625, "value_mse_loss_layer_030": 0.539062, "value_mse_loss_layer_031": 0.660156, "vq_loss_layer_000": 6.6e-05, "vq_loss_layer_001": 0.000153, "vq_loss_layer_002": 0.00028, "vq_loss_layer_003": 0.000748, "vq_loss_layer_004": 0.001137, "vq_loss_layer_005": 0.001343, "vq_loss_layer_006": 0.00193, "vq_loss_layer_007": 0.003128, "vq_loss_layer_008": 0.003189, "vq_loss_layer_009": 0.003723, "vq_loss_layer_010": 0.003799, "vq_loss_layer_011": 0.004089, "vq_loss_layer_012": 0.00589, "vq_loss_layer_013": 0.005249, "vq_loss_layer_014": 0.005981, "vq_loss_layer_015": 0.004333, "vq_loss_layer_016": 0.005219, "vq_loss_layer_017": 0.003662, "vq_loss_layer_018": 0.003296, "vq_loss_layer_019": 0.002426, "vq_loss_layer_020": 0.002426, "vq_loss_layer_021": 0.004791, "vq_loss_layer_022": 0.003174, "vq_loss_layer_023": 0.003784, "vq_loss_layer_024": 0.004181, "vq_loss_layer_025": 0.005554, "vq_loss_layer_026": 0.007385, "vq_loss_layer_027": 0.009338, "vq_loss_layer_028": 0.010681, "vq_loss_layer_029": 0.014038, "vq_loss_layer_030": 0.028931, "vq_loss_layer_031": 0.080566 }, { "ce_loss": 2.320547, "epoch": 2e-05, "grad_norm": 0.00174082035664469, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.140918, "kv_vq_loss": 0.006618, "learning_rate": 0.0003252574989159953, "loss": 0.147546, "step": 20, "value_mse_loss_layer_000": 0.002304, "value_mse_loss_layer_001": 0.010254, "value_mse_loss_layer_002": 0.049316, "value_mse_loss_layer_003": 0.079102, "value_mse_loss_layer_004": 0.074219, "value_mse_loss_layer_005": 0.067871, "value_mse_loss_layer_006": 0.077637, "value_mse_loss_layer_007": 0.089844, "value_mse_loss_layer_008": 0.095215, "value_mse_loss_layer_009": 0.130859, "value_mse_loss_layer_010": 0.119629, "value_mse_loss_layer_011": 0.130859, "value_mse_loss_layer_012": 0.135742, "value_mse_loss_layer_013": 0.136719, "value_mse_loss_layer_014": 0.148438, "value_mse_loss_layer_015": 0.134766, "value_mse_loss_layer_016": 0.149414, "value_mse_loss_layer_017": 0.135742, "value_mse_loss_layer_018": 0.168945, "value_mse_loss_layer_019": 0.189453, "value_mse_loss_layer_020": 0.185547, "value_mse_loss_layer_021": 0.209961, "value_mse_loss_layer_022": 0.201172, "value_mse_loss_layer_023": 0.265625, "value_mse_loss_layer_024": 0.300781, "value_mse_loss_layer_025": 0.408203, "value_mse_loss_layer_026": 0.267578, "value_mse_loss_layer_027": 0.394531, "value_mse_loss_layer_028": 0.349609, "value_mse_loss_layer_029": 0.535156, "value_mse_loss_layer_030": 0.478516, "value_mse_loss_layer_031": 0.515625, "vq_loss_layer_000": 5.4e-05, "vq_loss_layer_001": 6.2e-05, "vq_loss_layer_002": 0.00019, "vq_loss_layer_003": 0.000648, "vq_loss_layer_004": 0.00116, "vq_loss_layer_005": 0.00119, "vq_loss_layer_006": 0.001945, "vq_loss_layer_007": 0.002731, "vq_loss_layer_008": 0.002701, "vq_loss_layer_009": 0.003662, "vq_loss_layer_010": 0.003159, "vq_loss_layer_011": 0.003586, "vq_loss_layer_012": 0.004822, "vq_loss_layer_013": 0.004913, "vq_loss_layer_014": 0.004669, "vq_loss_layer_015": 0.004639, "vq_loss_layer_016": 0.005127, "vq_loss_layer_017": 0.004791, "vq_loss_layer_018": 0.003662, "vq_loss_layer_019": 0.00267, "vq_loss_layer_020": 0.002991, "vq_loss_layer_021": 0.005219, "vq_loss_layer_022": 0.003143, "vq_loss_layer_023": 0.004974, "vq_loss_layer_024": 0.004242, "vq_loss_layer_025": 0.006256, "vq_loss_layer_026": 0.007812, "vq_loss_layer_027": 0.007538, "vq_loss_layer_028": 0.009644, "vq_loss_layer_029": 0.015869, "vq_loss_layer_030": 0.024902, "vq_loss_layer_031": 0.044434 }, { "ce_loss": 2.278703, "epoch": 3e-05, "grad_norm": 0.0012153700226917863, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.13089, "kv_vq_loss": 0.005801, "learning_rate": 0.0003692803136799155, "loss": 0.136755, "step": 30, "value_mse_loss_layer_000": 0.002304, "value_mse_loss_layer_001": 0.009644, "value_mse_loss_layer_002": 0.046143, "value_mse_loss_layer_003": 0.074707, "value_mse_loss_layer_004": 0.066895, "value_mse_loss_layer_005": 0.061523, "value_mse_loss_layer_006": 0.070312, "value_mse_loss_layer_007": 0.07959, "value_mse_loss_layer_008": 0.085938, "value_mse_loss_layer_009": 0.117676, "value_mse_loss_layer_010": 0.106934, "value_mse_loss_layer_011": 0.11084, "value_mse_loss_layer_012": 0.119629, "value_mse_loss_layer_013": 0.118164, "value_mse_loss_layer_014": 0.133789, "value_mse_loss_layer_015": 0.125, "value_mse_loss_layer_016": 0.136719, "value_mse_loss_layer_017": 0.128906, "value_mse_loss_layer_018": 0.155273, "value_mse_loss_layer_019": 0.169922, "value_mse_loss_layer_020": 0.173828, "value_mse_loss_layer_021": 0.203125, "value_mse_loss_layer_022": 0.189453, "value_mse_loss_layer_023": 0.25, "value_mse_loss_layer_024": 0.28125, "value_mse_loss_layer_025": 0.355469, "value_mse_loss_layer_026": 0.249023, "value_mse_loss_layer_027": 0.337891, "value_mse_loss_layer_028": 0.332031, "value_mse_loss_layer_029": 0.484375, "value_mse_loss_layer_030": 0.412109, "value_mse_loss_layer_031": 0.447266, "vq_loss_layer_000": 4.9e-05, "vq_loss_layer_001": 6.9e-05, "vq_loss_layer_002": 0.000153, "vq_loss_layer_003": 0.000626, "vq_loss_layer_004": 0.00103, "vq_loss_layer_005": 0.001076, "vq_loss_layer_006": 0.001709, "vq_loss_layer_007": 0.00235, "vq_loss_layer_008": 0.002106, "vq_loss_layer_009": 0.003036, "vq_loss_layer_010": 0.002701, "vq_loss_layer_011": 0.002655, "vq_loss_layer_012": 0.004089, "vq_loss_layer_013": 0.003876, "vq_loss_layer_014": 0.003967, "vq_loss_layer_015": 0.004211, "vq_loss_layer_016": 0.004303, "vq_loss_layer_017": 0.003967, "vq_loss_layer_018": 0.002686, "vq_loss_layer_019": 0.001984, "vq_loss_layer_020": 0.002609, "vq_loss_layer_021": 0.004059, "vq_loss_layer_022": 0.002686, "vq_loss_layer_023": 0.003281, "vq_loss_layer_024": 0.003098, "vq_loss_layer_025": 0.004425, "vq_loss_layer_026": 0.005798, "vq_loss_layer_027": 0.006256, "vq_loss_layer_028": 0.008728, "vq_loss_layer_029": 0.012451, "vq_loss_layer_030": 0.02124, "vq_loss_layer_031": 0.034424 }, { "ce_loss": 2.272556, "epoch": 4e-05, "grad_norm": 0.0008305184892378747, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.123651, "kv_vq_loss": 0.005219, "learning_rate": 0.0004005149978319905, "loss": 0.12887, "step": 40, "value_mse_loss_layer_000": 0.002274, "value_mse_loss_layer_001": 0.009094, "value_mse_loss_layer_002": 0.041748, "value_mse_loss_layer_003": 0.070312, "value_mse_loss_layer_004": 0.061035, "value_mse_loss_layer_005": 0.056152, "value_mse_loss_layer_006": 0.066406, "value_mse_loss_layer_007": 0.07373, "value_mse_loss_layer_008": 0.080566, "value_mse_loss_layer_009": 0.111328, "value_mse_loss_layer_010": 0.101562, "value_mse_loss_layer_011": 0.102051, "value_mse_loss_layer_012": 0.113281, "value_mse_loss_layer_013": 0.111328, "value_mse_loss_layer_014": 0.125977, "value_mse_loss_layer_015": 0.117676, "value_mse_loss_layer_016": 0.125, "value_mse_loss_layer_017": 0.122559, "value_mse_loss_layer_018": 0.149414, "value_mse_loss_layer_019": 0.154297, "value_mse_loss_layer_020": 0.162109, "value_mse_loss_layer_021": 0.193359, "value_mse_loss_layer_022": 0.174805, "value_mse_loss_layer_023": 0.238281, "value_mse_loss_layer_024": 0.255859, "value_mse_loss_layer_025": 0.333984, "value_mse_loss_layer_026": 0.224609, "value_mse_loss_layer_027": 0.308594, "value_mse_loss_layer_028": 0.304688, "value_mse_loss_layer_029": 0.462891, "value_mse_loss_layer_030": 0.386719, "value_mse_loss_layer_031": 0.408203, "vq_loss_layer_000": 4.6e-05, "vq_loss_layer_001": 5.2e-05, "vq_loss_layer_002": 0.000128, "vq_loss_layer_003": 0.000587, "vq_loss_layer_004": 0.000942, "vq_loss_layer_005": 0.001022, "vq_loss_layer_006": 0.001602, "vq_loss_layer_007": 0.002151, "vq_loss_layer_008": 0.001984, "vq_loss_layer_009": 0.003098, "vq_loss_layer_010": 0.00267, "vq_loss_layer_011": 0.002426, "vq_loss_layer_012": 0.003998, "vq_loss_layer_013": 0.003387, "vq_loss_layer_014": 0.00386, "vq_loss_layer_015": 0.003906, "vq_loss_layer_016": 0.004395, "vq_loss_layer_017": 0.003891, "vq_loss_layer_018": 0.002579, "vq_loss_layer_019": 0.0019, "vq_loss_layer_020": 0.002731, "vq_loss_layer_021": 0.004486, "vq_loss_layer_022": 0.002243, "vq_loss_layer_023": 0.003601, "vq_loss_layer_024": 0.002991, "vq_loss_layer_025": 0.004639, "vq_loss_layer_026": 0.005524, "vq_loss_layer_027": 0.006226, "vq_loss_layer_028": 0.007874, "vq_loss_layer_029": 0.013, "vq_loss_layer_030": 0.019287, "vq_loss_layer_031": 0.028687 }, { "ce_loss": 2.278332, "epoch": 5e-05, "grad_norm": 0.0005490669864229858, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.118408, "kv_vq_loss": 0.004699, "learning_rate": 0.0004247425010840046, "loss": 0.123114, "step": 50, "value_mse_loss_layer_000": 0.002228, "value_mse_loss_layer_001": 0.008606, "value_mse_loss_layer_002": 0.037842, "value_mse_loss_layer_003": 0.064941, "value_mse_loss_layer_004": 0.056885, "value_mse_loss_layer_005": 0.05249, "value_mse_loss_layer_006": 0.062012, "value_mse_loss_layer_007": 0.068359, "value_mse_loss_layer_008": 0.075195, "value_mse_loss_layer_009": 0.104004, "value_mse_loss_layer_010": 0.094238, "value_mse_loss_layer_011": 0.09375, "value_mse_loss_layer_012": 0.105469, "value_mse_loss_layer_013": 0.103027, "value_mse_loss_layer_014": 0.116699, "value_mse_loss_layer_015": 0.11377, "value_mse_loss_layer_016": 0.117188, "value_mse_loss_layer_017": 0.115723, "value_mse_loss_layer_018": 0.140625, "value_mse_loss_layer_019": 0.143555, "value_mse_loss_layer_020": 0.154297, "value_mse_loss_layer_021": 0.18457, "value_mse_loss_layer_022": 0.165039, "value_mse_loss_layer_023": 0.233398, "value_mse_loss_layer_024": 0.251953, "value_mse_loss_layer_025": 0.326172, "value_mse_loss_layer_026": 0.225586, "value_mse_loss_layer_027": 0.304688, "value_mse_loss_layer_028": 0.300781, "value_mse_loss_layer_029": 0.462891, "value_mse_loss_layer_030": 0.369141, "value_mse_loss_layer_031": 0.380859, "vq_loss_layer_000": 4.5e-05, "vq_loss_layer_001": 5.7e-05, "vq_loss_layer_002": 0.000115, "vq_loss_layer_003": 0.000542, "vq_loss_layer_004": 0.000809, "vq_loss_layer_005": 0.000896, "vq_loss_layer_006": 0.00135, "vq_loss_layer_007": 0.001755, "vq_loss_layer_008": 0.001701, "vq_loss_layer_009": 0.00264, "vq_loss_layer_010": 0.002289, "vq_loss_layer_011": 0.002014, "vq_loss_layer_012": 0.003326, "vq_loss_layer_013": 0.002777, "vq_loss_layer_014": 0.003189, "vq_loss_layer_015": 0.003677, "vq_loss_layer_016": 0.004333, "vq_loss_layer_017": 0.003448, "vq_loss_layer_018": 0.002487, "vq_loss_layer_019": 0.001862, "vq_loss_layer_020": 0.002472, "vq_loss_layer_021": 0.004456, "vq_loss_layer_022": 0.001968, "vq_loss_layer_023": 0.003418, "vq_loss_layer_024": 0.00322, "vq_loss_layer_025": 0.004517, "vq_loss_layer_026": 0.005615, "vq_loss_layer_027": 0.006378, "vq_loss_layer_028": 0.007233, "vq_loss_layer_029": 0.012268, "vq_loss_layer_030": 0.016113, "vq_loss_layer_031": 0.028442 }, { "ce_loss": 2.315096, "epoch": 6e-05, "grad_norm": 0.00041623672586865723, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.088867, "key_mse_loss_layer_021": 0.084961, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.07373, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.113501, "kv_vq_loss": 0.004296, "learning_rate": 0.0004445378125959108, "loss": 0.11781, "step": 60, "value_mse_loss_layer_000": 0.002136, "value_mse_loss_layer_001": 0.008179, "value_mse_loss_layer_002": 0.033936, "value_mse_loss_layer_003": 0.061768, "value_mse_loss_layer_004": 0.052734, "value_mse_loss_layer_005": 0.047852, "value_mse_loss_layer_006": 0.057617, "value_mse_loss_layer_007": 0.0625, "value_mse_loss_layer_008": 0.070801, "value_mse_loss_layer_009": 0.099121, "value_mse_loss_layer_010": 0.087402, "value_mse_loss_layer_011": 0.087402, "value_mse_loss_layer_012": 0.095703, "value_mse_loss_layer_013": 0.092773, "value_mse_loss_layer_014": 0.10791, "value_mse_loss_layer_015": 0.106934, "value_mse_loss_layer_016": 0.10791, "value_mse_loss_layer_017": 0.106445, "value_mse_loss_layer_018": 0.132812, "value_mse_loss_layer_019": 0.131836, "value_mse_loss_layer_020": 0.138672, "value_mse_loss_layer_021": 0.168945, "value_mse_loss_layer_022": 0.148438, "value_mse_loss_layer_023": 0.224609, "value_mse_loss_layer_024": 0.240234, "value_mse_loss_layer_025": 0.300781, "value_mse_loss_layer_026": 0.207031, "value_mse_loss_layer_027": 0.277344, "value_mse_loss_layer_028": 0.271484, "value_mse_loss_layer_029": 0.4375, "value_mse_loss_layer_030": 0.332031, "value_mse_loss_layer_031": 0.359375, "vq_loss_layer_000": 4.3e-05, "vq_loss_layer_001": 4.7e-05, "vq_loss_layer_002": 9.3e-05, "vq_loss_layer_003": 0.000463, "vq_loss_layer_004": 0.000725, "vq_loss_layer_005": 0.000702, "vq_loss_layer_006": 0.001144, "vq_loss_layer_007": 0.001518, "vq_loss_layer_008": 0.001495, "vq_loss_layer_009": 0.002304, "vq_loss_layer_010": 0.001984, "vq_loss_layer_011": 0.001785, "vq_loss_layer_012": 0.002884, "vq_loss_layer_013": 0.002304, "vq_loss_layer_014": 0.002884, "vq_loss_layer_015": 0.003204, "vq_loss_layer_016": 0.00415, "vq_loss_layer_017": 0.003143, "vq_loss_layer_018": 0.00235, "vq_loss_layer_019": 0.001808, "vq_loss_layer_020": 0.002411, "vq_loss_layer_021": 0.004333, "vq_loss_layer_022": 0.001656, "vq_loss_layer_023": 0.00386, "vq_loss_layer_024": 0.003326, "vq_loss_layer_025": 0.004272, "vq_loss_layer_026": 0.005157, "vq_loss_layer_027": 0.005341, "vq_loss_layer_028": 0.006348, "vq_loss_layer_029": 0.014221, "vq_loss_layer_030": 0.014832, "vq_loss_layer_031": 0.029053 }, { "ce_loss": 2.308991, "epoch": 7e-05, "grad_norm": 0.0004680850834120065, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.062012, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.083496, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.092773, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.104492, "key_mse_loss_layer_014": 0.101562, "key_mse_loss_layer_015": 0.092773, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.097168, "key_mse_loss_layer_024": 0.080566, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.088867, "key_mse_loss_layer_027": 0.092773, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.092773, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.109534, "kv_vq_loss": 0.003861, "learning_rate": 0.0004612745100035642, "loss": 0.113397, "step": 70, "value_mse_loss_layer_000": 0.002045, "value_mse_loss_layer_001": 0.00769, "value_mse_loss_layer_002": 0.031494, "value_mse_loss_layer_003": 0.058838, "value_mse_loss_layer_004": 0.054199, "value_mse_loss_layer_005": 0.046631, "value_mse_loss_layer_006": 0.054443, "value_mse_loss_layer_007": 0.060303, "value_mse_loss_layer_008": 0.065918, "value_mse_loss_layer_009": 0.085449, "value_mse_loss_layer_010": 0.076172, "value_mse_loss_layer_011": 0.075684, "value_mse_loss_layer_012": 0.086426, "value_mse_loss_layer_013": 0.081055, "value_mse_loss_layer_014": 0.096191, "value_mse_loss_layer_015": 0.091797, "value_mse_loss_layer_016": 0.100098, "value_mse_loss_layer_017": 0.098145, "value_mse_loss_layer_018": 0.125977, "value_mse_loss_layer_019": 0.124023, "value_mse_loss_layer_020": 0.138672, "value_mse_loss_layer_021": 0.162109, "value_mse_loss_layer_022": 0.145508, "value_mse_loss_layer_023": 0.217773, "value_mse_loss_layer_024": 0.242188, "value_mse_loss_layer_025": 0.308594, "value_mse_loss_layer_026": 0.233398, "value_mse_loss_layer_027": 0.304688, "value_mse_loss_layer_028": 0.306641, "value_mse_loss_layer_029": 0.464844, "value_mse_loss_layer_030": 0.376953, "value_mse_loss_layer_031": 0.396484, "vq_loss_layer_000": 4.6e-05, "vq_loss_layer_001": 0.000149, "vq_loss_layer_002": 0.000144, "vq_loss_layer_003": 0.000523, "vq_loss_layer_004": 0.000729, "vq_loss_layer_005": 0.000732, "vq_loss_layer_006": 0.000999, "vq_loss_layer_007": 0.001328, "vq_loss_layer_008": 0.001472, "vq_loss_layer_009": 0.001892, "vq_loss_layer_010": 0.001816, "vq_loss_layer_011": 0.001511, "vq_loss_layer_012": 0.002319, "vq_loss_layer_013": 0.001984, "vq_loss_layer_014": 0.002426, "vq_loss_layer_015": 0.002731, "vq_loss_layer_016": 0.003708, "vq_loss_layer_017": 0.002289, "vq_loss_layer_018": 0.002319, "vq_loss_layer_019": 0.001953, "vq_loss_layer_020": 0.00161, "vq_loss_layer_021": 0.003662, "vq_loss_layer_022": 0.001572, "vq_loss_layer_023": 0.003098, "vq_loss_layer_024": 0.00322, "vq_loss_layer_025": 0.004517, "vq_loss_layer_026": 0.005554, "vq_loss_layer_027": 0.006287, "vq_loss_layer_028": 0.009888, "vq_loss_layer_029": 0.014587, "vq_loss_layer_030": 0.018555, "vq_loss_layer_031": 0.041748 }, { "ce_loss": 2.317907, "epoch": 8e-05, "grad_norm": 0.000667389715090394, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.105292, "kv_vq_loss": 0.00347, "learning_rate": 0.0004757724967479858, "loss": 0.108771, "step": 80, "value_mse_loss_layer_000": 0.002121, "value_mse_loss_layer_001": 0.007507, "value_mse_loss_layer_002": 0.030151, "value_mse_loss_layer_003": 0.055664, "value_mse_loss_layer_004": 0.048828, "value_mse_loss_layer_005": 0.043945, "value_mse_loss_layer_006": 0.05249, "value_mse_loss_layer_007": 0.057373, "value_mse_loss_layer_008": 0.063965, "value_mse_loss_layer_009": 0.085938, "value_mse_loss_layer_010": 0.078125, "value_mse_loss_layer_011": 0.07666, "value_mse_loss_layer_012": 0.084961, "value_mse_loss_layer_013": 0.083008, "value_mse_loss_layer_014": 0.095215, "value_mse_loss_layer_015": 0.091797, "value_mse_loss_layer_016": 0.095703, "value_mse_loss_layer_017": 0.09668, "value_mse_loss_layer_018": 0.114258, "value_mse_loss_layer_019": 0.115234, "value_mse_loss_layer_020": 0.125977, "value_mse_loss_layer_021": 0.15332, "value_mse_loss_layer_022": 0.122559, "value_mse_loss_layer_023": 0.198242, "value_mse_loss_layer_024": 0.194336, "value_mse_loss_layer_025": 0.277344, "value_mse_loss_layer_026": 0.183594, "value_mse_loss_layer_027": 0.227539, "value_mse_loss_layer_028": 0.236328, "value_mse_loss_layer_029": 0.378906, "value_mse_loss_layer_030": 0.271484, "value_mse_loss_layer_031": 0.335938, "vq_loss_layer_000": 4.2e-05, "vq_loss_layer_001": 4.9e-05, "vq_loss_layer_002": 8.5e-05, "vq_loss_layer_003": 0.000425, "vq_loss_layer_004": 0.000603, "vq_loss_layer_005": 0.000595, "vq_loss_layer_006": 0.0009, "vq_loss_layer_007": 0.001183, "vq_loss_layer_008": 0.001312, "vq_loss_layer_009": 0.001816, "vq_loss_layer_010": 0.001579, "vq_loss_layer_011": 0.001381, "vq_loss_layer_012": 0.002197, "vq_loss_layer_013": 0.001923, "vq_loss_layer_014": 0.002335, "vq_loss_layer_015": 0.002457, "vq_loss_layer_016": 0.003235, "vq_loss_layer_017": 0.002365, "vq_loss_layer_018": 0.002136, "vq_loss_layer_019": 0.001266, "vq_loss_layer_020": 0.0019, "vq_loss_layer_021": 0.00354, "vq_loss_layer_022": 0.00135, "vq_loss_layer_023": 0.003998, "vq_loss_layer_024": 0.002441, "vq_loss_layer_025": 0.004059, "vq_loss_layer_026": 0.004425, "vq_loss_layer_027": 0.004242, "vq_loss_layer_028": 0.004791, "vq_loss_layer_029": 0.009277, "vq_loss_layer_030": 0.010681, "vq_loss_layer_031": 0.022705 }, { "ce_loss": 2.270657, "epoch": 9e-05, "grad_norm": 0.0010078417835757136, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.101697, "kv_vq_loss": 0.003186, "learning_rate": 0.0004885606273598312, "loss": 0.104883, "step": 90, "value_mse_loss_layer_000": 0.002045, "value_mse_loss_layer_001": 0.007263, "value_mse_loss_layer_002": 0.028442, "value_mse_loss_layer_003": 0.052979, "value_mse_loss_layer_004": 0.044434, "value_mse_loss_layer_005": 0.040771, "value_mse_loss_layer_006": 0.049316, "value_mse_loss_layer_007": 0.052734, "value_mse_loss_layer_008": 0.059814, "value_mse_loss_layer_009": 0.07959, "value_mse_loss_layer_010": 0.072754, "value_mse_loss_layer_011": 0.070801, "value_mse_loss_layer_012": 0.077637, "value_mse_loss_layer_013": 0.077148, "value_mse_loss_layer_014": 0.087402, "value_mse_loss_layer_015": 0.085938, "value_mse_loss_layer_016": 0.087891, "value_mse_loss_layer_017": 0.09082, "value_mse_loss_layer_018": 0.107422, "value_mse_loss_layer_019": 0.108398, "value_mse_loss_layer_020": 0.119141, "value_mse_loss_layer_021": 0.146484, "value_mse_loss_layer_022": 0.115234, "value_mse_loss_layer_023": 0.193359, "value_mse_loss_layer_024": 0.182617, "value_mse_loss_layer_025": 0.273438, "value_mse_loss_layer_026": 0.181641, "value_mse_loss_layer_027": 0.212891, "value_mse_loss_layer_028": 0.22168, "value_mse_loss_layer_029": 0.376953, "value_mse_loss_layer_030": 0.261719, "value_mse_loss_layer_031": 0.318359, "vq_loss_layer_000": 3.9e-05, "vq_loss_layer_001": 3.7e-05, "vq_loss_layer_002": 7.2e-05, "vq_loss_layer_003": 0.000395, "vq_loss_layer_004": 0.000526, "vq_loss_layer_005": 0.000479, "vq_loss_layer_006": 0.00079, "vq_loss_layer_007": 0.001053, "vq_loss_layer_008": 0.001083, "vq_loss_layer_009": 0.001518, "vq_loss_layer_010": 0.001251, "vq_loss_layer_011": 0.001129, "vq_loss_layer_012": 0.001938, "vq_loss_layer_013": 0.001671, "vq_loss_layer_014": 0.001846, "vq_loss_layer_015": 0.002167, "vq_loss_layer_016": 0.002655, "vq_loss_layer_017": 0.00206, "vq_loss_layer_018": 0.001732, "vq_loss_layer_019": 0.001122, "vq_loss_layer_020": 0.001411, "vq_loss_layer_021": 0.002762, "vq_loss_layer_022": 0.001099, "vq_loss_layer_023": 0.00325, "vq_loss_layer_024": 0.001755, "vq_loss_layer_025": 0.003448, "vq_loss_layer_026": 0.00354, "vq_loss_layer_027": 0.003326, "vq_loss_layer_028": 0.003769, "vq_loss_layer_029": 0.00885, "vq_loss_layer_030": 0.008667, "vq_loss_layer_031": 0.019043 }, { "ce_loss": 2.30752, "epoch": 0.0001, "grad_norm": 0.001495473668910563, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.098145, "kv_vq_loss": 0.003017, "learning_rate": 0.0004999999999999999, "loss": 0.10116, "step": 100, "value_mse_loss_layer_000": 0.001999, "value_mse_loss_layer_001": 0.006866, "value_mse_loss_layer_002": 0.0271, "value_mse_loss_layer_003": 0.049561, "value_mse_loss_layer_004": 0.04541, "value_mse_loss_layer_005": 0.04126, "value_mse_loss_layer_006": 0.047607, "value_mse_loss_layer_007": 0.051758, "value_mse_loss_layer_008": 0.057617, "value_mse_loss_layer_009": 0.07666, "value_mse_loss_layer_010": 0.069824, "value_mse_loss_layer_011": 0.068848, "value_mse_loss_layer_012": 0.076172, "value_mse_loss_layer_013": 0.075195, "value_mse_loss_layer_014": 0.084961, "value_mse_loss_layer_015": 0.083008, "value_mse_loss_layer_016": 0.08252, "value_mse_loss_layer_017": 0.083008, "value_mse_loss_layer_018": 0.099121, "value_mse_loss_layer_019": 0.100586, "value_mse_loss_layer_020": 0.10791, "value_mse_loss_layer_021": 0.137695, "value_mse_loss_layer_022": 0.10791, "value_mse_loss_layer_023": 0.168945, "value_mse_loss_layer_024": 0.15625, "value_mse_loss_layer_025": 0.255859, "value_mse_loss_layer_026": 0.165039, "value_mse_loss_layer_027": 0.194336, "value_mse_loss_layer_028": 0.195312, "value_mse_loss_layer_029": 0.349609, "value_mse_loss_layer_030": 0.244141, "value_mse_loss_layer_031": 0.3125, "vq_loss_layer_000": 3.8e-05, "vq_loss_layer_001": 4.9e-05, "vq_loss_layer_002": 8.5e-05, "vq_loss_layer_003": 0.000492, "vq_loss_layer_004": 0.00053, "vq_loss_layer_005": 0.000557, "vq_loss_layer_006": 0.000771, "vq_loss_layer_007": 0.00106, "vq_loss_layer_008": 0.001137, "vq_loss_layer_009": 0.001617, "vq_loss_layer_010": 0.00145, "vq_loss_layer_011": 0.00132, "vq_loss_layer_012": 0.001968, "vq_loss_layer_013": 0.001709, "vq_loss_layer_014": 0.002029, "vq_loss_layer_015": 0.002213, "vq_loss_layer_016": 0.002777, "vq_loss_layer_017": 0.002121, "vq_loss_layer_018": 0.00164, "vq_loss_layer_019": 0.001068, "vq_loss_layer_020": 0.001579, "vq_loss_layer_021": 0.003754, "vq_loss_layer_022": 0.001373, "vq_loss_layer_023": 0.003143, "vq_loss_layer_024": 0.001816, "vq_loss_layer_025": 0.004364, "vq_loss_layer_026": 0.004059, "vq_loss_layer_027": 0.003677, "vq_loss_layer_028": 0.004272, "vq_loss_layer_029": 0.008667, "vq_loss_layer_030": 0.009216, "vq_loss_layer_031": 0.023315 }, { "ce_loss": 2.262941, "epoch": 0.00011, "grad_norm": 0.001139570609666407, "key_mse_loss_layer_000": 0.003677, "key_mse_loss_layer_001": 0.010925, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.083496, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.105957, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.093262, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.088867, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.095483, "kv_vq_loss": 0.002697, "learning_rate": 0.0005103481712895562, "loss": 0.098163, "step": 110, "value_mse_loss_layer_000": 0.001892, "value_mse_loss_layer_001": 0.006622, "value_mse_loss_layer_002": 0.025513, "value_mse_loss_layer_003": 0.04834, "value_mse_loss_layer_004": 0.044434, "value_mse_loss_layer_005": 0.038818, "value_mse_loss_layer_006": 0.044434, "value_mse_loss_layer_007": 0.047852, "value_mse_loss_layer_008": 0.055176, "value_mse_loss_layer_009": 0.069336, "value_mse_loss_layer_010": 0.0625, "value_mse_loss_layer_011": 0.062988, "value_mse_loss_layer_012": 0.069336, "value_mse_loss_layer_013": 0.066406, "value_mse_loss_layer_014": 0.077637, "value_mse_loss_layer_015": 0.072266, "value_mse_loss_layer_016": 0.075684, "value_mse_loss_layer_017": 0.074219, "value_mse_loss_layer_018": 0.097168, "value_mse_loss_layer_019": 0.097168, "value_mse_loss_layer_020": 0.104492, "value_mse_loss_layer_021": 0.125, "value_mse_loss_layer_022": 0.105957, "value_mse_loss_layer_023": 0.172852, "value_mse_loss_layer_024": 0.177734, "value_mse_loss_layer_025": 0.257812, "value_mse_loss_layer_026": 0.19043, "value_mse_loss_layer_027": 0.224609, "value_mse_loss_layer_028": 0.22168, "value_mse_loss_layer_029": 0.394531, "value_mse_loss_layer_030": 0.285156, "value_mse_loss_layer_031": 0.326172, "vq_loss_layer_000": 4.1e-05, "vq_loss_layer_001": 7.8e-05, "vq_loss_layer_002": 7.7e-05, "vq_loss_layer_003": 0.000359, "vq_loss_layer_004": 0.000439, "vq_loss_layer_005": 0.000416, "vq_loss_layer_006": 0.000622, "vq_loss_layer_007": 0.000851, "vq_loss_layer_008": 0.001053, "vq_loss_layer_009": 0.001251, "vq_loss_layer_010": 0.001221, "vq_loss_layer_011": 0.001091, "vq_loss_layer_012": 0.001587, "vq_loss_layer_013": 0.001411, "vq_loss_layer_014": 0.00174, "vq_loss_layer_015": 0.001686, "vq_loss_layer_016": 0.002457, "vq_loss_layer_017": 0.001625, "vq_loss_layer_018": 0.00135, "vq_loss_layer_019": 0.001358, "vq_loss_layer_020": 0.001007, "vq_loss_layer_021": 0.00206, "vq_loss_layer_022": 0.001007, "vq_loss_layer_023": 0.001778, "vq_loss_layer_024": 0.001778, "vq_loss_layer_025": 0.002808, "vq_loss_layer_026": 0.003494, "vq_loss_layer_027": 0.003418, "vq_loss_layer_028": 0.004578, "vq_loss_layer_029": 0.011475, "vq_loss_layer_030": 0.01123, "vq_loss_layer_031": 0.026123 }, { "ce_loss": 2.26671, "epoch": 0.00012, "grad_norm": 0.0017898675287142396, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.092877, "kv_vq_loss": 0.002532, "learning_rate": 0.0005197953115119061, "loss": 0.095428, "step": 120, "value_mse_loss_layer_000": 0.001984, "value_mse_loss_layer_001": 0.006561, "value_mse_loss_layer_002": 0.025757, "value_mse_loss_layer_003": 0.046143, "value_mse_loss_layer_004": 0.040771, "value_mse_loss_layer_005": 0.037598, "value_mse_loss_layer_006": 0.043457, "value_mse_loss_layer_007": 0.047119, "value_mse_loss_layer_008": 0.052734, "value_mse_loss_layer_009": 0.070801, "value_mse_loss_layer_010": 0.063477, "value_mse_loss_layer_011": 0.063477, "value_mse_loss_layer_012": 0.068848, "value_mse_loss_layer_013": 0.067871, "value_mse_loss_layer_014": 0.075684, "value_mse_loss_layer_015": 0.074707, "value_mse_loss_layer_016": 0.072754, "value_mse_loss_layer_017": 0.07666, "value_mse_loss_layer_018": 0.084961, "value_mse_loss_layer_019": 0.09082, "value_mse_loss_layer_020": 0.099609, "value_mse_loss_layer_021": 0.120117, "value_mse_loss_layer_022": 0.098633, "value_mse_loss_layer_023": 0.141602, "value_mse_loss_layer_024": 0.136719, "value_mse_loss_layer_025": 0.236328, "value_mse_loss_layer_026": 0.145508, "value_mse_loss_layer_027": 0.166992, "value_mse_loss_layer_028": 0.173828, "value_mse_loss_layer_029": 0.308594, "value_mse_loss_layer_030": 0.214844, "value_mse_loss_layer_031": 0.291016, "vq_loss_layer_000": 3.6e-05, "vq_loss_layer_001": 5.1e-05, "vq_loss_layer_002": 7.1e-05, "vq_loss_layer_003": 0.000353, "vq_loss_layer_004": 0.000425, "vq_loss_layer_005": 0.000431, "vq_loss_layer_006": 0.000645, "vq_loss_layer_007": 0.000889, "vq_loss_layer_008": 0.000961, "vq_loss_layer_009": 0.00135, "vq_loss_layer_010": 0.001183, "vq_loss_layer_011": 0.001083, "vq_loss_layer_012": 0.001717, "vq_loss_layer_013": 0.001488, "vq_loss_layer_014": 0.001701, "vq_loss_layer_015": 0.001823, "vq_loss_layer_016": 0.001968, "vq_loss_layer_017": 0.001755, "vq_loss_layer_018": 0.001198, "vq_loss_layer_019": 0.000843, "vq_loss_layer_020": 0.001236, "vq_loss_layer_021": 0.002518, "vq_loss_layer_022": 0.001244, "vq_loss_layer_023": 0.00206, "vq_loss_layer_024": 0.001373, "vq_loss_layer_025": 0.003647, "vq_loss_layer_026": 0.003296, "vq_loss_layer_027": 0.002777, "vq_loss_layer_028": 0.003998, "vq_loss_layer_029": 0.008118, "vq_loss_layer_030": 0.008179, "vq_loss_layer_031": 0.020752 }, { "ce_loss": 2.299637, "epoch": 0.00013, "grad_norm": 0.0024067885242402554, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.090363, "kv_vq_loss": 0.002352, "learning_rate": 0.0005284858380767091, "loss": 0.092737, "step": 130, "value_mse_loss_layer_000": 0.001938, "value_mse_loss_layer_001": 0.00647, "value_mse_loss_layer_002": 0.024658, "value_mse_loss_layer_003": 0.043701, "value_mse_loss_layer_004": 0.036621, "value_mse_loss_layer_005": 0.034424, "value_mse_loss_layer_006": 0.040283, "value_mse_loss_layer_007": 0.042969, "value_mse_loss_layer_008": 0.050049, "value_mse_loss_layer_009": 0.06543, "value_mse_loss_layer_010": 0.05957, "value_mse_loss_layer_011": 0.060303, "value_mse_loss_layer_012": 0.064453, "value_mse_loss_layer_013": 0.063965, "value_mse_loss_layer_014": 0.070801, "value_mse_loss_layer_015": 0.071289, "value_mse_loss_layer_016": 0.068359, "value_mse_loss_layer_017": 0.072266, "value_mse_loss_layer_018": 0.080566, "value_mse_loss_layer_019": 0.086914, "value_mse_loss_layer_020": 0.093262, "value_mse_loss_layer_021": 0.115723, "value_mse_loss_layer_022": 0.097168, "value_mse_loss_layer_023": 0.135742, "value_mse_loss_layer_024": 0.132812, "value_mse_loss_layer_025": 0.224609, "value_mse_loss_layer_026": 0.138672, "value_mse_loss_layer_027": 0.168945, "value_mse_loss_layer_028": 0.167969, "value_mse_loss_layer_029": 0.302734, "value_mse_loss_layer_030": 0.216797, "value_mse_loss_layer_031": 0.28125, "vq_loss_layer_000": 3.4e-05, "vq_loss_layer_001": 2.6e-05, "vq_loss_layer_002": 4.3e-05, "vq_loss_layer_003": 0.000265, "vq_loss_layer_004": 0.000336, "vq_loss_layer_005": 0.000347, "vq_loss_layer_006": 0.000561, "vq_loss_layer_007": 0.000763, "vq_loss_layer_008": 0.000813, "vq_loss_layer_009": 0.001175, "vq_loss_layer_010": 0.000957, "vq_loss_layer_011": 0.000889, "vq_loss_layer_012": 0.001488, "vq_loss_layer_013": 0.001198, "vq_loss_layer_014": 0.001411, "vq_loss_layer_015": 0.001572, "vq_loss_layer_016": 0.001793, "vq_loss_layer_017": 0.001495, "vq_loss_layer_018": 0.000969, "vq_loss_layer_019": 0.000713, "vq_loss_layer_020": 0.000977, "vq_loss_layer_021": 0.002075, "vq_loss_layer_022": 0.000935, "vq_loss_layer_023": 0.001671, "vq_loss_layer_024": 0.001183, "vq_loss_layer_025": 0.002411, "vq_loss_layer_026": 0.002411, "vq_loss_layer_027": 0.002426, "vq_loss_layer_028": 0.002747, "vq_loss_layer_029": 0.007172, "vq_loss_layer_030": 0.006958, "vq_loss_layer_031": 0.015991 }, { "ce_loss": 2.282158, "epoch": 0.00014, "grad_norm": 0.002260759240016341, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.078613, "kv_mse_loss": 0.088391, "kv_vq_loss": 0.002206, "learning_rate": 0.0005365320089195594, "loss": 0.090546, "step": 140, "value_mse_loss_layer_000": 0.001884, "value_mse_loss_layer_001": 0.006317, "value_mse_loss_layer_002": 0.024048, "value_mse_loss_layer_003": 0.040771, "value_mse_loss_layer_004": 0.035156, "value_mse_loss_layer_005": 0.033203, "value_mse_loss_layer_006": 0.038818, "value_mse_loss_layer_007": 0.04248, "value_mse_loss_layer_008": 0.049561, "value_mse_loss_layer_009": 0.064453, "value_mse_loss_layer_010": 0.058838, "value_mse_loss_layer_011": 0.060547, "value_mse_loss_layer_012": 0.064453, "value_mse_loss_layer_013": 0.064453, "value_mse_loss_layer_014": 0.071777, "value_mse_loss_layer_015": 0.071777, "value_mse_loss_layer_016": 0.069336, "value_mse_loss_layer_017": 0.071777, "value_mse_loss_layer_018": 0.077637, "value_mse_loss_layer_019": 0.086914, "value_mse_loss_layer_020": 0.093262, "value_mse_loss_layer_021": 0.117188, "value_mse_loss_layer_022": 0.098633, "value_mse_loss_layer_023": 0.137695, "value_mse_loss_layer_024": 0.134766, "value_mse_loss_layer_025": 0.213867, "value_mse_loss_layer_026": 0.142578, "value_mse_loss_layer_027": 0.171875, "value_mse_loss_layer_028": 0.173828, "value_mse_loss_layer_029": 0.300781, "value_mse_loss_layer_030": 0.216797, "value_mse_loss_layer_031": 0.275391, "vq_loss_layer_000": 3.4e-05, "vq_loss_layer_001": 2.1e-05, "vq_loss_layer_002": 4.7e-05, "vq_loss_layer_003": 0.00024, "vq_loss_layer_004": 0.000294, "vq_loss_layer_005": 0.000343, "vq_loss_layer_006": 0.000534, "vq_loss_layer_007": 0.000767, "vq_loss_layer_008": 0.000809, "vq_loss_layer_009": 0.001137, "vq_loss_layer_010": 0.000999, "vq_loss_layer_011": 0.000908, "vq_loss_layer_012": 0.001511, "vq_loss_layer_013": 0.001366, "vq_loss_layer_014": 0.001495, "vq_loss_layer_015": 0.001762, "vq_loss_layer_016": 0.00193, "vq_loss_layer_017": 0.001564, "vq_loss_layer_018": 0.000992, "vq_loss_layer_019": 0.000782, "vq_loss_layer_020": 0.001091, "vq_loss_layer_021": 0.002106, "vq_loss_layer_022": 0.000946, "vq_loss_layer_023": 0.001617, "vq_loss_layer_024": 0.001221, "vq_loss_layer_025": 0.002213, "vq_loss_layer_026": 0.00267, "vq_loss_layer_027": 0.002411, "vq_loss_layer_028": 0.00296, "vq_loss_layer_029": 0.007446, "vq_loss_layer_030": 0.007599, "vq_loss_layer_031": 0.016357 }, { "ce_loss": 2.264931, "epoch": 0.00015, "grad_norm": 0.002586740767583251, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.08689, "kv_vq_loss": 0.002166, "learning_rate": 0.0005440228147639202, "loss": 0.08905, "step": 150, "value_mse_loss_layer_000": 0.001854, "value_mse_loss_layer_001": 0.006104, "value_mse_loss_layer_002": 0.023315, "value_mse_loss_layer_003": 0.039307, "value_mse_loss_layer_004": 0.033447, "value_mse_loss_layer_005": 0.032227, "value_mse_loss_layer_006": 0.037598, "value_mse_loss_layer_007": 0.040283, "value_mse_loss_layer_008": 0.047119, "value_mse_loss_layer_009": 0.062256, "value_mse_loss_layer_010": 0.057617, "value_mse_loss_layer_011": 0.05835, "value_mse_loss_layer_012": 0.062012, "value_mse_loss_layer_013": 0.061523, "value_mse_loss_layer_014": 0.067383, "value_mse_loss_layer_015": 0.070312, "value_mse_loss_layer_016": 0.063965, "value_mse_loss_layer_017": 0.067871, "value_mse_loss_layer_018": 0.07373, "value_mse_loss_layer_019": 0.081055, "value_mse_loss_layer_020": 0.088867, "value_mse_loss_layer_021": 0.10791, "value_mse_loss_layer_022": 0.090332, "value_mse_loss_layer_023": 0.124023, "value_mse_loss_layer_024": 0.123535, "value_mse_loss_layer_025": 0.193359, "value_mse_loss_layer_026": 0.131836, "value_mse_loss_layer_027": 0.161133, "value_mse_loss_layer_028": 0.157227, "value_mse_loss_layer_029": 0.275391, "value_mse_loss_layer_030": 0.205078, "value_mse_loss_layer_031": 0.263672, "vq_loss_layer_000": 3.2e-05, "vq_loss_layer_001": 2.1e-05, "vq_loss_layer_002": 4e-05, "vq_loss_layer_003": 0.000242, "vq_loss_layer_004": 0.000284, "vq_loss_layer_005": 0.000338, "vq_loss_layer_006": 0.000515, "vq_loss_layer_007": 0.000702, "vq_loss_layer_008": 0.000763, "vq_loss_layer_009": 0.001083, "vq_loss_layer_010": 0.000931, "vq_loss_layer_011": 0.000843, "vq_loss_layer_012": 0.001457, "vq_loss_layer_013": 0.001213, "vq_loss_layer_014": 0.001358, "vq_loss_layer_015": 0.001823, "vq_loss_layer_016": 0.001755, "vq_loss_layer_017": 0.001534, "vq_loss_layer_018": 0.000938, "vq_loss_layer_019": 0.000687, "vq_loss_layer_020": 0.000999, "vq_loss_layer_021": 0.001999, "vq_loss_layer_022": 0.000938, "vq_loss_layer_023": 0.00174, "vq_loss_layer_024": 0.001144, "vq_loss_layer_025": 0.002106, "vq_loss_layer_026": 0.002365, "vq_loss_layer_027": 0.002289, "vq_loss_layer_028": 0.002579, "vq_loss_layer_029": 0.006561, "vq_loss_layer_030": 0.006683, "vq_loss_layer_031": 0.01532 }, { "ce_loss": 2.285188, "epoch": 0.00016, "grad_norm": 0.003311865497380495, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.085095, "kv_vq_loss": 0.002006, "learning_rate": 0.000551029995663981, "loss": 0.087134, "step": 160, "value_mse_loss_layer_000": 0.001793, "value_mse_loss_layer_001": 0.006073, "value_mse_loss_layer_002": 0.022705, "value_mse_loss_layer_003": 0.038086, "value_mse_loss_layer_004": 0.033203, "value_mse_loss_layer_005": 0.031494, "value_mse_loss_layer_006": 0.036133, "value_mse_loss_layer_007": 0.039795, "value_mse_loss_layer_008": 0.045898, "value_mse_loss_layer_009": 0.059082, "value_mse_loss_layer_010": 0.053955, "value_mse_loss_layer_011": 0.055664, "value_mse_loss_layer_012": 0.059326, "value_mse_loss_layer_013": 0.059326, "value_mse_loss_layer_014": 0.06543, "value_mse_loss_layer_015": 0.064941, "value_mse_loss_layer_016": 0.062988, "value_mse_loss_layer_017": 0.06543, "value_mse_loss_layer_018": 0.071289, "value_mse_loss_layer_019": 0.081055, "value_mse_loss_layer_020": 0.085938, "value_mse_loss_layer_021": 0.111328, "value_mse_loss_layer_022": 0.089355, "value_mse_loss_layer_023": 0.12207, "value_mse_loss_layer_024": 0.126953, "value_mse_loss_layer_025": 0.1875, "value_mse_loss_layer_026": 0.130859, "value_mse_loss_layer_027": 0.165039, "value_mse_loss_layer_028": 0.163086, "value_mse_loss_layer_029": 0.267578, "value_mse_loss_layer_030": 0.211914, "value_mse_loss_layer_031": 0.265625, "vq_loss_layer_000": 3.4e-05, "vq_loss_layer_001": 2.2e-05, "vq_loss_layer_002": 3.6e-05, "vq_loss_layer_003": 0.000197, "vq_loss_layer_004": 0.000278, "vq_loss_layer_005": 0.000309, "vq_loss_layer_006": 0.000467, "vq_loss_layer_007": 0.000668, "vq_loss_layer_008": 0.00071, "vq_loss_layer_009": 0.000992, "vq_loss_layer_010": 0.000854, "vq_loss_layer_011": 0.00087, "vq_loss_layer_012": 0.001366, "vq_loss_layer_013": 0.00119, "vq_loss_layer_014": 0.001358, "vq_loss_layer_015": 0.00145, "vq_loss_layer_016": 0.001724, "vq_loss_layer_017": 0.001289, "vq_loss_layer_018": 0.000778, "vq_loss_layer_019": 0.000675, "vq_loss_layer_020": 0.000824, "vq_loss_layer_021": 0.001869, "vq_loss_layer_022": 0.000763, "vq_loss_layer_023": 0.001343, "vq_loss_layer_024": 0.000992, "vq_loss_layer_025": 0.001579, "vq_loss_layer_026": 0.002167, "vq_loss_layer_027": 0.002304, "vq_loss_layer_028": 0.002625, "vq_loss_layer_029": 0.005249, "vq_loss_layer_030": 0.007172, "vq_loss_layer_031": 0.015259 }, { "ce_loss": 2.286414, "epoch": 0.00017, "grad_norm": 0.0028764549642801285, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.084265, "kv_vq_loss": 0.001929, "learning_rate": 0.0005576122303445685, "loss": 0.086218, "step": 170, "value_mse_loss_layer_000": 0.001839, "value_mse_loss_layer_001": 0.00592, "value_mse_loss_layer_002": 0.022583, "value_mse_loss_layer_003": 0.039307, "value_mse_loss_layer_004": 0.036621, "value_mse_loss_layer_005": 0.03418, "value_mse_loss_layer_006": 0.03833, "value_mse_loss_layer_007": 0.043213, "value_mse_loss_layer_008": 0.048096, "value_mse_loss_layer_009": 0.061768, "value_mse_loss_layer_010": 0.055176, "value_mse_loss_layer_011": 0.057617, "value_mse_loss_layer_012": 0.062256, "value_mse_loss_layer_013": 0.062012, "value_mse_loss_layer_014": 0.068359, "value_mse_loss_layer_015": 0.064453, "value_mse_loss_layer_016": 0.061768, "value_mse_loss_layer_017": 0.066406, "value_mse_loss_layer_018": 0.072266, "value_mse_loss_layer_019": 0.075195, "value_mse_loss_layer_020": 0.085449, "value_mse_loss_layer_021": 0.104492, "value_mse_loss_layer_022": 0.086426, "value_mse_loss_layer_023": 0.131836, "value_mse_loss_layer_024": 0.120605, "value_mse_loss_layer_025": 0.182617, "value_mse_loss_layer_026": 0.148438, "value_mse_loss_layer_027": 0.174805, "value_mse_loss_layer_028": 0.168945, "value_mse_loss_layer_029": 0.269531, "value_mse_loss_layer_030": 0.230469, "value_mse_loss_layer_031": 0.279297, "vq_loss_layer_000": 3.7e-05, "vq_loss_layer_001": 8e-05, "vq_loss_layer_002": 8.2e-05, "vq_loss_layer_003": 0.000243, "vq_loss_layer_004": 0.000328, "vq_loss_layer_005": 0.000357, "vq_loss_layer_006": 0.00053, "vq_loss_layer_007": 0.000751, "vq_loss_layer_008": 0.000927, "vq_loss_layer_009": 0.001167, "vq_loss_layer_010": 0.001068, "vq_loss_layer_011": 0.000942, "vq_loss_layer_012": 0.001434, "vq_loss_layer_013": 0.00135, "vq_loss_layer_014": 0.001617, "vq_loss_layer_015": 0.001503, "vq_loss_layer_016": 0.001755, "vq_loss_layer_017": 0.00148, "vq_loss_layer_018": 0.000999, "vq_loss_layer_019": 0.000683, "vq_loss_layer_020": 0.000851, "vq_loss_layer_021": 0.00209, "vq_loss_layer_022": 0.000923, "vq_loss_layer_023": 0.001839, "vq_loss_layer_024": 0.001129, "vq_loss_layer_025": 0.002075, "vq_loss_layer_026": 0.002914, "vq_loss_layer_027": 0.002716, "vq_loss_layer_028": 0.003632, "vq_loss_layer_029": 0.007446, "vq_loss_layer_030": 0.010193, "vq_loss_layer_031": 0.024048 }, { "ce_loss": 2.298526, "epoch": 0.00018, "grad_norm": 0.004377362783998251, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.108398, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.136719, "key_mse_loss_layer_014": 0.132812, "key_mse_loss_layer_015": 0.119629, "key_mse_loss_layer_016": 0.118164, "key_mse_loss_layer_017": 0.11377, "key_mse_loss_layer_018": 0.12207, "key_mse_loss_layer_019": 0.095703, "key_mse_loss_layer_020": 0.110352, "key_mse_loss_layer_021": 0.103027, "key_mse_loss_layer_022": 0.109375, "key_mse_loss_layer_023": 0.109375, "key_mse_loss_layer_024": 0.086914, "key_mse_loss_layer_025": 0.080566, "key_mse_loss_layer_026": 0.096191, "key_mse_loss_layer_027": 0.092285, "key_mse_loss_layer_028": 0.101074, "key_mse_loss_layer_029": 0.089355, "key_mse_loss_layer_030": 0.104004, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.082904, "kv_vq_loss": 0.001865, "learning_rate": 0.0005638181262758264, "loss": 0.084778, "step": 180, "value_mse_loss_layer_000": 0.001709, "value_mse_loss_layer_001": 0.005646, "value_mse_loss_layer_002": 0.021362, "value_mse_loss_layer_003": 0.036621, "value_mse_loss_layer_004": 0.033691, "value_mse_loss_layer_005": 0.03125, "value_mse_loss_layer_006": 0.035645, "value_mse_loss_layer_007": 0.039795, "value_mse_loss_layer_008": 0.043945, "value_mse_loss_layer_009": 0.05542, "value_mse_loss_layer_010": 0.051025, "value_mse_loss_layer_011": 0.052002, "value_mse_loss_layer_012": 0.055908, "value_mse_loss_layer_013": 0.053467, "value_mse_loss_layer_014": 0.059082, "value_mse_loss_layer_015": 0.054932, "value_mse_loss_layer_016": 0.052246, "value_mse_loss_layer_017": 0.056396, "value_mse_loss_layer_018": 0.0625, "value_mse_loss_layer_019": 0.070312, "value_mse_loss_layer_020": 0.077148, "value_mse_loss_layer_021": 0.091797, "value_mse_loss_layer_022": 0.075195, "value_mse_loss_layer_023": 0.100586, "value_mse_loss_layer_024": 0.104492, "value_mse_loss_layer_025": 0.148438, "value_mse_loss_layer_026": 0.113281, "value_mse_loss_layer_027": 0.142578, "value_mse_loss_layer_028": 0.138672, "value_mse_loss_layer_029": 0.228516, "value_mse_loss_layer_030": 0.195312, "value_mse_loss_layer_031": 0.261719, "vq_loss_layer_000": 3.5e-05, "vq_loss_layer_001": 5.6e-05, "vq_loss_layer_002": 5.7e-05, "vq_loss_layer_003": 0.000174, "vq_loss_layer_004": 0.000284, "vq_loss_layer_005": 0.00033, "vq_loss_layer_006": 0.000454, "vq_loss_layer_007": 0.000675, "vq_loss_layer_008": 0.00079, "vq_loss_layer_009": 0.000935, "vq_loss_layer_010": 0.000977, "vq_loss_layer_011": 0.000847, "vq_loss_layer_012": 0.001434, "vq_loss_layer_013": 0.000969, "vq_loss_layer_014": 0.001328, "vq_loss_layer_015": 0.001122, "vq_loss_layer_016": 0.001358, "vq_loss_layer_017": 0.001198, "vq_loss_layer_018": 0.000763, "vq_loss_layer_019": 0.000648, "vq_loss_layer_020": 0.000793, "vq_loss_layer_021": 0.001678, "vq_loss_layer_022": 0.000771, "vq_loss_layer_023": 0.001266, "vq_loss_layer_024": 0.001175, "vq_loss_layer_025": 0.001854, "vq_loss_layer_026": 0.002182, "vq_loss_layer_027": 0.002457, "vq_loss_layer_028": 0.003891, "vq_loss_layer_029": 0.005676, "vq_loss_layer_030": 0.008667, "vq_loss_layer_031": 0.020508 }, { "ce_loss": 2.278995, "epoch": 0.00019, "grad_norm": 0.003155197249725461, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.081628, "kv_vq_loss": 0.001794, "learning_rate": 0.0005696884002382071, "loss": 0.083398, "step": 190, "value_mse_loss_layer_000": 0.001732, "value_mse_loss_layer_001": 0.005646, "value_mse_loss_layer_002": 0.020752, "value_mse_loss_layer_003": 0.035889, "value_mse_loss_layer_004": 0.030884, "value_mse_loss_layer_005": 0.029419, "value_mse_loss_layer_006": 0.036377, "value_mse_loss_layer_007": 0.037598, "value_mse_loss_layer_008": 0.043457, "value_mse_loss_layer_009": 0.056396, "value_mse_loss_layer_010": 0.050293, "value_mse_loss_layer_011": 0.052002, "value_mse_loss_layer_012": 0.054932, "value_mse_loss_layer_013": 0.05542, "value_mse_loss_layer_014": 0.061035, "value_mse_loss_layer_015": 0.061279, "value_mse_loss_layer_016": 0.058105, "value_mse_loss_layer_017": 0.061523, "value_mse_loss_layer_018": 0.06543, "value_mse_loss_layer_019": 0.077148, "value_mse_loss_layer_020": 0.081055, "value_mse_loss_layer_021": 0.10498, "value_mse_loss_layer_022": 0.085938, "value_mse_loss_layer_023": 0.11377, "value_mse_loss_layer_024": 0.117188, "value_mse_loss_layer_025": 0.165039, "value_mse_loss_layer_026": 0.123535, "value_mse_loss_layer_027": 0.157227, "value_mse_loss_layer_028": 0.15332, "value_mse_loss_layer_029": 0.243164, "value_mse_loss_layer_030": 0.198242, "value_mse_loss_layer_031": 0.251953, "vq_loss_layer_000": 3e-05, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 3e-05, "vq_loss_layer_003": 0.000172, "vq_loss_layer_004": 0.000254, "vq_loss_layer_005": 0.000269, "vq_loss_layer_006": 0.000504, "vq_loss_layer_007": 0.000599, "vq_loss_layer_008": 0.000645, "vq_loss_layer_009": 0.000923, "vq_loss_layer_010": 0.000721, "vq_loss_layer_011": 0.000706, "vq_loss_layer_012": 0.001236, "vq_loss_layer_013": 0.000954, "vq_loss_layer_014": 0.001144, "vq_loss_layer_015": 0.001274, "vq_loss_layer_016": 0.001427, "vq_loss_layer_017": 0.001083, "vq_loss_layer_018": 0.00066, "vq_loss_layer_019": 0.000607, "vq_loss_layer_020": 0.000675, "vq_loss_layer_021": 0.001717, "vq_loss_layer_022": 0.000698, "vq_loss_layer_023": 0.000999, "vq_loss_layer_024": 0.00079, "vq_loss_layer_025": 0.001396, "vq_loss_layer_026": 0.001816, "vq_loss_layer_027": 0.001884, "vq_loss_layer_028": 0.002289, "vq_loss_layer_029": 0.004425, "vq_loss_layer_030": 0.005524, "vq_loss_layer_031": 0.013794 }, { "ce_loss": 2.268496, "epoch": 0.0002, "grad_norm": 0.006588414311408997, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.072754, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.080981, "kv_vq_loss": 0.001788, "learning_rate": 0.0005752574989159952, "loss": 0.082721, "step": 200, "value_mse_loss_layer_000": 0.001717, "value_mse_loss_layer_001": 0.005524, "value_mse_loss_layer_002": 0.020508, "value_mse_loss_layer_003": 0.0354, "value_mse_loss_layer_004": 0.030762, "value_mse_loss_layer_005": 0.029907, "value_mse_loss_layer_006": 0.034668, "value_mse_loss_layer_007": 0.037354, "value_mse_loss_layer_008": 0.043457, "value_mse_loss_layer_009": 0.055664, "value_mse_loss_layer_010": 0.050537, "value_mse_loss_layer_011": 0.05249, "value_mse_loss_layer_012": 0.058594, "value_mse_loss_layer_013": 0.055908, "value_mse_loss_layer_014": 0.061523, "value_mse_loss_layer_015": 0.061523, "value_mse_loss_layer_016": 0.058838, "value_mse_loss_layer_017": 0.0625, "value_mse_loss_layer_018": 0.064453, "value_mse_loss_layer_019": 0.072266, "value_mse_loss_layer_020": 0.084961, "value_mse_loss_layer_021": 0.097168, "value_mse_loss_layer_022": 0.07959, "value_mse_loss_layer_023": 0.109863, "value_mse_loss_layer_024": 0.112793, "value_mse_loss_layer_025": 0.168945, "value_mse_loss_layer_026": 0.111816, "value_mse_loss_layer_027": 0.143555, "value_mse_loss_layer_028": 0.15918, "value_mse_loss_layer_029": 0.217773, "value_mse_loss_layer_030": 0.180664, "value_mse_loss_layer_031": 0.246094, "vq_loss_layer_000": 3e-05, "vq_loss_layer_001": 3.1e-05, "vq_loss_layer_002": 2.8e-05, "vq_loss_layer_003": 0.00016, "vq_loss_layer_004": 0.000237, "vq_loss_layer_005": 0.000277, "vq_loss_layer_006": 0.000412, "vq_loss_layer_007": 0.000553, "vq_loss_layer_008": 0.000622, "vq_loss_layer_009": 0.00082, "vq_loss_layer_010": 0.000763, "vq_loss_layer_011": 0.00071, "vq_loss_layer_012": 0.001442, "vq_loss_layer_013": 0.001007, "vq_loss_layer_014": 0.001137, "vq_loss_layer_015": 0.001282, "vq_loss_layer_016": 0.001465, "vq_loss_layer_017": 0.001274, "vq_loss_layer_018": 0.000721, "vq_loss_layer_019": 0.000641, "vq_loss_layer_020": 0.000843, "vq_loss_layer_021": 0.001595, "vq_loss_layer_022": 0.000679, "vq_loss_layer_023": 0.001366, "vq_loss_layer_024": 0.00095, "vq_loss_layer_025": 0.001747, "vq_loss_layer_026": 0.001785, "vq_loss_layer_027": 0.00206, "vq_loss_layer_028": 0.003143, "vq_loss_layer_029": 0.0047, "vq_loss_layer_030": 0.005829, "vq_loss_layer_031": 0.014893 }, { "ce_loss": 2.320817, "epoch": 0.00021, "grad_norm": 0.0061269402503967285, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.083984, "kv_mse_loss": 0.07995, "kv_vq_loss": 0.0017, "learning_rate": 0.0005805548236834797, "loss": 0.081598, "step": 210, "value_mse_loss_layer_000": 0.001724, "value_mse_loss_layer_001": 0.005493, "value_mse_loss_layer_002": 0.020508, "value_mse_loss_layer_003": 0.03418, "value_mse_loss_layer_004": 0.030273, "value_mse_loss_layer_005": 0.029541, "value_mse_loss_layer_006": 0.03418, "value_mse_loss_layer_007": 0.037598, "value_mse_loss_layer_008": 0.043213, "value_mse_loss_layer_009": 0.056152, "value_mse_loss_layer_010": 0.050293, "value_mse_loss_layer_011": 0.052246, "value_mse_loss_layer_012": 0.055664, "value_mse_loss_layer_013": 0.056641, "value_mse_loss_layer_014": 0.060791, "value_mse_loss_layer_015": 0.061523, "value_mse_loss_layer_016": 0.059326, "value_mse_loss_layer_017": 0.062012, "value_mse_loss_layer_018": 0.062988, "value_mse_loss_layer_019": 0.075684, "value_mse_loss_layer_020": 0.083984, "value_mse_loss_layer_021": 0.108887, "value_mse_loss_layer_022": 0.081543, "value_mse_loss_layer_023": 0.111328, "value_mse_loss_layer_024": 0.110352, "value_mse_loss_layer_025": 0.165039, "value_mse_loss_layer_026": 0.114258, "value_mse_loss_layer_027": 0.143555, "value_mse_loss_layer_028": 0.147461, "value_mse_loss_layer_029": 0.236328, "value_mse_loss_layer_030": 0.186523, "value_mse_loss_layer_031": 0.240234, "vq_loss_layer_000": 3e-05, "vq_loss_layer_001": 2.5e-05, "vq_loss_layer_002": 2.8e-05, "vq_loss_layer_003": 0.000134, "vq_loss_layer_004": 0.000215, "vq_loss_layer_005": 0.000278, "vq_loss_layer_006": 0.000412, "vq_loss_layer_007": 0.00058, "vq_loss_layer_008": 0.000618, "vq_loss_layer_009": 0.000896, "vq_loss_layer_010": 0.000748, "vq_loss_layer_011": 0.000732, "vq_loss_layer_012": 0.001205, "vq_loss_layer_013": 0.00106, "vq_loss_layer_014": 0.001122, "vq_loss_layer_015": 0.001297, "vq_loss_layer_016": 0.001434, "vq_loss_layer_017": 0.00116, "vq_loss_layer_018": 0.000721, "vq_loss_layer_019": 0.000626, "vq_loss_layer_020": 0.000832, "vq_loss_layer_021": 0.001831, "vq_loss_layer_022": 0.000713, "vq_loss_layer_023": 0.001282, "vq_loss_layer_024": 0.001015, "vq_loss_layer_025": 0.001633, "vq_loss_layer_026": 0.002045, "vq_loss_layer_027": 0.002365, "vq_loss_layer_028": 0.003754, "vq_loss_layer_029": 0.008179, "vq_loss_layer_030": 0.007782, "vq_loss_layer_031": 0.018677 }, { "ce_loss": 2.32321, "epoch": 0.00022, "grad_norm": 0.004126644227653742, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.079205, "kv_vq_loss": 0.001706, "learning_rate": 0.0005856056702055515, "loss": 0.080878, "step": 220, "value_mse_loss_layer_000": 0.001701, "value_mse_loss_layer_001": 0.005341, "value_mse_loss_layer_002": 0.019653, "value_mse_loss_layer_003": 0.032715, "value_mse_loss_layer_004": 0.029053, "value_mse_loss_layer_005": 0.028442, "value_mse_loss_layer_006": 0.033447, "value_mse_loss_layer_007": 0.036621, "value_mse_loss_layer_008": 0.041748, "value_mse_loss_layer_009": 0.054443, "value_mse_loss_layer_010": 0.048584, "value_mse_loss_layer_011": 0.052002, "value_mse_loss_layer_012": 0.054932, "value_mse_loss_layer_013": 0.055176, "value_mse_loss_layer_014": 0.061279, "value_mse_loss_layer_015": 0.060791, "value_mse_loss_layer_016": 0.059326, "value_mse_loss_layer_017": 0.05957, "value_mse_loss_layer_018": 0.059814, "value_mse_loss_layer_019": 0.070801, "value_mse_loss_layer_020": 0.077637, "value_mse_loss_layer_021": 0.09375, "value_mse_loss_layer_022": 0.079102, "value_mse_loss_layer_023": 0.101562, "value_mse_loss_layer_024": 0.107422, "value_mse_loss_layer_025": 0.149414, "value_mse_loss_layer_026": 0.113281, "value_mse_loss_layer_027": 0.136719, "value_mse_loss_layer_028": 0.136719, "value_mse_loss_layer_029": 0.208008, "value_mse_loss_layer_030": 0.177734, "value_mse_loss_layer_031": 0.232422, "vq_loss_layer_000": 2.9e-05, "vq_loss_layer_001": 2.1e-05, "vq_loss_layer_002": 2.6e-05, "vq_loss_layer_003": 0.000128, "vq_loss_layer_004": 0.000206, "vq_loss_layer_005": 0.000257, "vq_loss_layer_006": 0.000399, "vq_loss_layer_007": 0.000568, "vq_loss_layer_008": 0.000576, "vq_loss_layer_009": 0.000786, "vq_loss_layer_010": 0.000706, "vq_loss_layer_011": 0.000771, "vq_loss_layer_012": 0.001213, "vq_loss_layer_013": 0.001007, "vq_loss_layer_014": 0.001205, "vq_loss_layer_015": 0.001434, "vq_loss_layer_016": 0.00145, "vq_loss_layer_017": 0.001053, "vq_loss_layer_018": 0.000633, "vq_loss_layer_019": 0.000526, "vq_loss_layer_020": 0.00069, "vq_loss_layer_021": 0.001503, "vq_loss_layer_022": 0.00069, "vq_loss_layer_023": 0.00103, "vq_loss_layer_024": 0.000858, "vq_loss_layer_025": 0.001335, "vq_loss_layer_026": 0.001793, "vq_loss_layer_027": 0.001694, "vq_loss_layer_028": 0.00206, "vq_loss_layer_029": 0.004059, "vq_loss_layer_030": 0.005493, "vq_loss_layer_031": 0.013 }, { "ce_loss": 2.323996, "epoch": 0.00023, "grad_norm": 0.00417805090546608, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.060059, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083008, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.106445, "key_mse_loss_layer_014": 0.103027, "key_mse_loss_layer_015": 0.09375, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.089355, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.087891, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.078601, "kv_vq_loss": 0.001667, "learning_rate": 0.0005904319590043981, "loss": 0.080255, "step": 230, "value_mse_loss_layer_000": 0.001617, "value_mse_loss_layer_001": 0.00528, "value_mse_loss_layer_002": 0.019653, "value_mse_loss_layer_003": 0.034424, "value_mse_loss_layer_004": 0.033691, "value_mse_loss_layer_005": 0.030029, "value_mse_loss_layer_006": 0.03418, "value_mse_loss_layer_007": 0.036621, "value_mse_loss_layer_008": 0.043945, "value_mse_loss_layer_009": 0.053223, "value_mse_loss_layer_010": 0.047852, "value_mse_loss_layer_011": 0.050049, "value_mse_loss_layer_012": 0.054932, "value_mse_loss_layer_013": 0.05542, "value_mse_loss_layer_014": 0.062012, "value_mse_loss_layer_015": 0.061768, "value_mse_loss_layer_016": 0.057861, "value_mse_loss_layer_017": 0.059326, "value_mse_loss_layer_018": 0.064941, "value_mse_loss_layer_019": 0.072266, "value_mse_loss_layer_020": 0.07666, "value_mse_loss_layer_021": 0.098633, "value_mse_loss_layer_022": 0.080566, "value_mse_loss_layer_023": 0.11084, "value_mse_loss_layer_024": 0.143555, "value_mse_loss_layer_025": 0.163086, "value_mse_loss_layer_026": 0.135742, "value_mse_loss_layer_027": 0.180664, "value_mse_loss_layer_028": 0.166992, "value_mse_loss_layer_029": 0.257812, "value_mse_loss_layer_030": 0.22168, "value_mse_loss_layer_031": 0.269531, "vq_loss_layer_000": 3.1e-05, "vq_loss_layer_001": 6.7e-05, "vq_loss_layer_002": 6.3e-05, "vq_loss_layer_003": 0.000186, "vq_loss_layer_004": 0.00028, "vq_loss_layer_005": 0.000271, "vq_loss_layer_006": 0.000463, "vq_loss_layer_007": 0.000565, "vq_loss_layer_008": 0.000713, "vq_loss_layer_009": 0.000755, "vq_loss_layer_010": 0.000759, "vq_loss_layer_011": 0.000755, "vq_loss_layer_012": 0.001167, "vq_loss_layer_013": 0.00106, "vq_loss_layer_014": 0.001228, "vq_loss_layer_015": 0.001396, "vq_loss_layer_016": 0.001701, "vq_loss_layer_017": 0.001045, "vq_loss_layer_018": 0.000759, "vq_loss_layer_019": 0.000816, "vq_loss_layer_020": 0.000595, "vq_loss_layer_021": 0.001663, "vq_loss_layer_022": 0.00069, "vq_loss_layer_023": 0.000801, "vq_loss_layer_024": 0.001854, "vq_loss_layer_025": 0.001633, "vq_loss_layer_026": 0.002441, "vq_loss_layer_027": 0.00322, "vq_loss_layer_028": 0.003281, "vq_loss_layer_029": 0.006775, "vq_loss_layer_030": 0.008484, "vq_loss_layer_031": 0.022949 }, { "ce_loss": 2.269056, "epoch": 0.00024, "grad_norm": 0.0057611712254583836, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.077972, "kv_vq_loss": 0.001644, "learning_rate": 0.0005950528104279014, "loss": 0.079584, "step": 240, "value_mse_loss_layer_000": 0.00164, "value_mse_loss_layer_001": 0.005188, "value_mse_loss_layer_002": 0.019165, "value_mse_loss_layer_003": 0.032227, "value_mse_loss_layer_004": 0.02832, "value_mse_loss_layer_005": 0.028076, "value_mse_loss_layer_006": 0.033203, "value_mse_loss_layer_007": 0.0354, "value_mse_loss_layer_008": 0.040527, "value_mse_loss_layer_009": 0.053467, "value_mse_loss_layer_010": 0.047363, "value_mse_loss_layer_011": 0.051514, "value_mse_loss_layer_012": 0.053467, "value_mse_loss_layer_013": 0.053711, "value_mse_loss_layer_014": 0.056885, "value_mse_loss_layer_015": 0.057617, "value_mse_loss_layer_016": 0.054443, "value_mse_loss_layer_017": 0.055908, "value_mse_loss_layer_018": 0.060303, "value_mse_loss_layer_019": 0.070801, "value_mse_loss_layer_020": 0.073242, "value_mse_loss_layer_021": 0.090332, "value_mse_loss_layer_022": 0.07959, "value_mse_loss_layer_023": 0.099609, "value_mse_loss_layer_024": 0.105469, "value_mse_loss_layer_025": 0.141602, "value_mse_loss_layer_026": 0.110352, "value_mse_loss_layer_027": 0.140625, "value_mse_loss_layer_028": 0.143555, "value_mse_loss_layer_029": 0.211914, "value_mse_loss_layer_030": 0.19043, "value_mse_loss_layer_031": 0.233398, "vq_loss_layer_000": 2.9e-05, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 2.4e-05, "vq_loss_layer_003": 0.000122, "vq_loss_layer_004": 0.000194, "vq_loss_layer_005": 0.000257, "vq_loss_layer_006": 0.000441, "vq_loss_layer_007": 0.000576, "vq_loss_layer_008": 0.000553, "vq_loss_layer_009": 0.000832, "vq_loss_layer_010": 0.00069, "vq_loss_layer_011": 0.000835, "vq_loss_layer_012": 0.001244, "vq_loss_layer_013": 0.000984, "vq_loss_layer_014": 0.00103, "vq_loss_layer_015": 0.001167, "vq_loss_layer_016": 0.001305, "vq_loss_layer_017": 0.001038, "vq_loss_layer_018": 0.000629, "vq_loss_layer_019": 0.000565, "vq_loss_layer_020": 0.000614, "vq_loss_layer_021": 0.001289, "vq_loss_layer_022": 0.000694, "vq_loss_layer_023": 0.000896, "vq_loss_layer_024": 0.000744, "vq_loss_layer_025": 0.001129, "vq_loss_layer_026": 0.001587, "vq_loss_layer_027": 0.001732, "vq_loss_layer_028": 0.00238, "vq_loss_layer_029": 0.004486, "vq_loss_layer_030": 0.005768, "vq_loss_layer_031": 0.014099 }, { "ce_loss": 2.346207, "epoch": 0.00025, "grad_norm": 0.0056743617169559, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.077368, "kv_vq_loss": 0.001583, "learning_rate": 0.0005994850021680092, "loss": 0.078931, "step": 250, "value_mse_loss_layer_000": 0.001541, "value_mse_loss_layer_001": 0.005005, "value_mse_loss_layer_002": 0.018433, "value_mse_loss_layer_003": 0.032959, "value_mse_loss_layer_004": 0.029907, "value_mse_loss_layer_005": 0.027588, "value_mse_loss_layer_006": 0.031982, "value_mse_loss_layer_007": 0.034668, "value_mse_loss_layer_008": 0.040527, "value_mse_loss_layer_009": 0.049561, "value_mse_loss_layer_010": 0.044189, "value_mse_loss_layer_011": 0.047119, "value_mse_loss_layer_012": 0.052246, "value_mse_loss_layer_013": 0.049316, "value_mse_loss_layer_014": 0.05542, "value_mse_loss_layer_015": 0.052979, "value_mse_loss_layer_016": 0.051758, "value_mse_loss_layer_017": 0.053955, "value_mse_loss_layer_018": 0.057373, "value_mse_loss_layer_019": 0.071289, "value_mse_loss_layer_020": 0.075195, "value_mse_loss_layer_021": 0.095215, "value_mse_loss_layer_022": 0.078613, "value_mse_loss_layer_023": 0.100586, "value_mse_loss_layer_024": 0.134766, "value_mse_loss_layer_025": 0.149414, "value_mse_loss_layer_026": 0.124023, "value_mse_loss_layer_027": 0.154297, "value_mse_loss_layer_028": 0.144531, "value_mse_loss_layer_029": 0.229492, "value_mse_loss_layer_030": 0.207031, "value_mse_loss_layer_031": 0.251953, "vq_loss_layer_000": 2.8e-05, "vq_loss_layer_001": 4.5e-05, "vq_loss_layer_002": 3.8e-05, "vq_loss_layer_003": 0.000137, "vq_loss_layer_004": 0.000226, "vq_loss_layer_005": 0.000257, "vq_loss_layer_006": 0.000393, "vq_loss_layer_007": 0.000542, "vq_loss_layer_008": 0.000648, "vq_loss_layer_009": 0.000706, "vq_loss_layer_010": 0.000679, "vq_loss_layer_011": 0.000702, "vq_loss_layer_012": 0.001266, "vq_loss_layer_013": 0.000866, "vq_loss_layer_014": 0.001122, "vq_loss_layer_015": 0.001038, "vq_loss_layer_016": 0.001442, "vq_loss_layer_017": 0.000965, "vq_loss_layer_018": 0.000595, "vq_loss_layer_019": 0.000679, "vq_loss_layer_020": 0.000576, "vq_loss_layer_021": 0.001381, "vq_loss_layer_022": 0.000629, "vq_loss_layer_023": 0.000824, "vq_loss_layer_024": 0.00103, "vq_loss_layer_025": 0.001152, "vq_loss_layer_026": 0.001701, "vq_loss_layer_027": 0.00206, "vq_loss_layer_028": 0.00238, "vq_loss_layer_029": 0.004639, "vq_loss_layer_030": 0.006592, "vq_loss_layer_031": 0.018433 }, { "ce_loss": 2.345742, "epoch": 0.00026, "grad_norm": 0.005134271457791328, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.077289, "kv_vq_loss": 0.001579, "learning_rate": 0.0006037433369927045, "loss": 0.07887, "step": 260, "value_mse_loss_layer_000": 0.001648, "value_mse_loss_layer_001": 0.005005, "value_mse_loss_layer_002": 0.018677, "value_mse_loss_layer_003": 0.031982, "value_mse_loss_layer_004": 0.028687, "value_mse_loss_layer_005": 0.027588, "value_mse_loss_layer_006": 0.031982, "value_mse_loss_layer_007": 0.036865, "value_mse_loss_layer_008": 0.040527, "value_mse_loss_layer_009": 0.053223, "value_mse_loss_layer_010": 0.04834, "value_mse_loss_layer_011": 0.050537, "value_mse_loss_layer_012": 0.052979, "value_mse_loss_layer_013": 0.054443, "value_mse_loss_layer_014": 0.06543, "value_mse_loss_layer_015": 0.057373, "value_mse_loss_layer_016": 0.056396, "value_mse_loss_layer_017": 0.056152, "value_mse_loss_layer_018": 0.057861, "value_mse_loss_layer_019": 0.066895, "value_mse_loss_layer_020": 0.072754, "value_mse_loss_layer_021": 0.085938, "value_mse_loss_layer_022": 0.071777, "value_mse_loss_layer_023": 0.089844, "value_mse_loss_layer_024": 0.09668, "value_mse_loss_layer_025": 0.135742, "value_mse_loss_layer_026": 0.100098, "value_mse_loss_layer_027": 0.134766, "value_mse_loss_layer_028": 0.124023, "value_mse_loss_layer_029": 0.188477, "value_mse_loss_layer_030": 0.160156, "value_mse_loss_layer_031": 0.226562, "vq_loss_layer_000": 2.9e-05, "vq_loss_layer_001": 2.9e-05, "vq_loss_layer_002": 3.4e-05, "vq_loss_layer_003": 0.000132, "vq_loss_layer_004": 0.00023, "vq_loss_layer_005": 0.000263, "vq_loss_layer_006": 0.000397, "vq_loss_layer_007": 0.000637, "vq_loss_layer_008": 0.00061, "vq_loss_layer_009": 0.000793, "vq_loss_layer_010": 0.000767, "vq_loss_layer_011": 0.000744, "vq_loss_layer_012": 0.001183, "vq_loss_layer_013": 0.000984, "vq_loss_layer_014": 0.001442, "vq_loss_layer_015": 0.001228, "vq_loss_layer_016": 0.001488, "vq_loss_layer_017": 0.00106, "vq_loss_layer_018": 0.000687, "vq_loss_layer_019": 0.000622, "vq_loss_layer_020": 0.000751, "vq_loss_layer_021": 0.001541, "vq_loss_layer_022": 0.000767, "vq_loss_layer_023": 0.001091, "vq_loss_layer_024": 0.000942, "vq_loss_layer_025": 0.001511, "vq_loss_layer_026": 0.001854, "vq_loss_layer_027": 0.002197, "vq_loss_layer_028": 0.002686, "vq_loss_layer_029": 0.003845, "vq_loss_layer_030": 0.005524, "vq_loss_layer_031": 0.015747 }, { "ce_loss": 2.318319, "epoch": 0.00027, "grad_norm": 0.005634434055536985, "key_mse_loss_layer_000": 0.002655, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.043945, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.076569, "kv_vq_loss": 0.00152, "learning_rate": 0.0006078409410397468, "loss": 0.078119, "step": 270, "value_mse_loss_layer_000": 0.001556, "value_mse_loss_layer_001": 0.004822, "value_mse_loss_layer_002": 0.018188, "value_mse_loss_layer_003": 0.029663, "value_mse_loss_layer_004": 0.027466, "value_mse_loss_layer_005": 0.026978, "value_mse_loss_layer_006": 0.030762, "value_mse_loss_layer_007": 0.034668, "value_mse_loss_layer_008": 0.038574, "value_mse_loss_layer_009": 0.051758, "value_mse_loss_layer_010": 0.045166, "value_mse_loss_layer_011": 0.048096, "value_mse_loss_layer_012": 0.049072, "value_mse_loss_layer_013": 0.049805, "value_mse_loss_layer_014": 0.054688, "value_mse_loss_layer_015": 0.051758, "value_mse_loss_layer_016": 0.047119, "value_mse_loss_layer_017": 0.052002, "value_mse_loss_layer_018": 0.05542, "value_mse_loss_layer_019": 0.064941, "value_mse_loss_layer_020": 0.067383, "value_mse_loss_layer_021": 0.081055, "value_mse_loss_layer_022": 0.072754, "value_mse_loss_layer_023": 0.088379, "value_mse_loss_layer_024": 0.089844, "value_mse_loss_layer_025": 0.125, "value_mse_loss_layer_026": 0.091309, "value_mse_loss_layer_027": 0.12207, "value_mse_loss_layer_028": 0.121094, "value_mse_loss_layer_029": 0.169922, "value_mse_loss_layer_030": 0.167969, "value_mse_loss_layer_031": 0.226562, "vq_loss_layer_000": 2.7e-05, "vq_loss_layer_001": 2.5e-05, "vq_loss_layer_002": 3.1e-05, "vq_loss_layer_003": 9.7e-05, "vq_loss_layer_004": 0.000231, "vq_loss_layer_005": 0.00032, "vq_loss_layer_006": 0.000397, "vq_loss_layer_007": 0.000568, "vq_loss_layer_008": 0.000595, "vq_loss_layer_009": 0.000843, "vq_loss_layer_010": 0.000668, "vq_loss_layer_011": 0.000683, "vq_loss_layer_012": 0.001137, "vq_loss_layer_013": 0.000843, "vq_loss_layer_014": 0.001129, "vq_loss_layer_015": 0.001022, "vq_loss_layer_016": 0.001053, "vq_loss_layer_017": 0.001167, "vq_loss_layer_018": 0.000622, "vq_loss_layer_019": 0.000565, "vq_loss_layer_020": 0.000687, "vq_loss_layer_021": 0.001404, "vq_loss_layer_022": 0.000816, "vq_loss_layer_023": 0.001274, "vq_loss_layer_024": 0.00082, "vq_loss_layer_025": 0.001305, "vq_loss_layer_026": 0.001595, "vq_loss_layer_027": 0.001846, "vq_loss_layer_028": 0.002365, "vq_loss_layer_029": 0.003143, "vq_loss_layer_030": 0.006348, "vq_loss_layer_031": 0.014832 }, { "ce_loss": 2.289091, "epoch": 0.00028, "grad_norm": 0.004935658536851406, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.075977, "kv_vq_loss": 0.001462, "learning_rate": 0.0006117895078355547, "loss": 0.07746, "step": 280, "value_mse_loss_layer_000": 0.001534, "value_mse_loss_layer_001": 0.004822, "value_mse_loss_layer_002": 0.0177, "value_mse_loss_layer_003": 0.030273, "value_mse_loss_layer_004": 0.026123, "value_mse_loss_layer_005": 0.025513, "value_mse_loss_layer_006": 0.029663, "value_mse_loss_layer_007": 0.032715, "value_mse_loss_layer_008": 0.03833, "value_mse_loss_layer_009": 0.04834, "value_mse_loss_layer_010": 0.044922, "value_mse_loss_layer_011": 0.047607, "value_mse_loss_layer_012": 0.047607, "value_mse_loss_layer_013": 0.04834, "value_mse_loss_layer_014": 0.053223, "value_mse_loss_layer_015": 0.052002, "value_mse_loss_layer_016": 0.049561, "value_mse_loss_layer_017": 0.05249, "value_mse_loss_layer_018": 0.057129, "value_mse_loss_layer_019": 0.067383, "value_mse_loss_layer_020": 0.068848, "value_mse_loss_layer_021": 0.086914, "value_mse_loss_layer_022": 0.074219, "value_mse_loss_layer_023": 0.098145, "value_mse_loss_layer_024": 0.100586, "value_mse_loss_layer_025": 0.134766, "value_mse_loss_layer_026": 0.115234, "value_mse_loss_layer_027": 0.136719, "value_mse_loss_layer_028": 0.129883, "value_mse_loss_layer_029": 0.201172, "value_mse_loss_layer_030": 0.177734, "value_mse_loss_layer_031": 0.225586, "vq_loss_layer_000": 2.6e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 0.0001, "vq_loss_layer_004": 0.000175, "vq_loss_layer_005": 0.000224, "vq_loss_layer_006": 0.000334, "vq_loss_layer_007": 0.000496, "vq_loss_layer_008": 0.000492, "vq_loss_layer_009": 0.000637, "vq_loss_layer_010": 0.000587, "vq_loss_layer_011": 0.000698, "vq_loss_layer_012": 0.001022, "vq_loss_layer_013": 0.00079, "vq_loss_layer_014": 0.000965, "vq_loss_layer_015": 0.000973, "vq_loss_layer_016": 0.001129, "vq_loss_layer_017": 0.0009, "vq_loss_layer_018": 0.000618, "vq_loss_layer_019": 0.000469, "vq_loss_layer_020": 0.000515, "vq_loss_layer_021": 0.00116, "vq_loss_layer_022": 0.000553, "vq_loss_layer_023": 0.000862, "vq_loss_layer_024": 0.000675, "vq_loss_layer_025": 0.000904, "vq_loss_layer_026": 0.001747, "vq_loss_layer_027": 0.001518, "vq_loss_layer_028": 0.001633, "vq_loss_layer_029": 0.003357, "vq_loss_layer_030": 0.004639, "vq_loss_layer_031": 0.012085 }, { "ce_loss": 2.251698, "epoch": 0.00029, "grad_norm": 0.004096552263945341, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.076154, "kv_vq_loss": 0.001525, "learning_rate": 0.0006155994994747389, "loss": 0.07771, "step": 290, "value_mse_loss_layer_000": 0.001572, "value_mse_loss_layer_001": 0.004852, "value_mse_loss_layer_002": 0.017944, "value_mse_loss_layer_003": 0.031006, "value_mse_loss_layer_004": 0.027588, "value_mse_loss_layer_005": 0.026245, "value_mse_loss_layer_006": 0.030273, "value_mse_loss_layer_007": 0.032959, "value_mse_loss_layer_008": 0.038818, "value_mse_loss_layer_009": 0.049561, "value_mse_loss_layer_010": 0.044922, "value_mse_loss_layer_011": 0.047607, "value_mse_loss_layer_012": 0.050537, "value_mse_loss_layer_013": 0.049805, "value_mse_loss_layer_014": 0.054932, "value_mse_loss_layer_015": 0.055908, "value_mse_loss_layer_016": 0.052734, "value_mse_loss_layer_017": 0.05249, "value_mse_loss_layer_018": 0.058838, "value_mse_loss_layer_019": 0.066895, "value_mse_loss_layer_020": 0.070801, "value_mse_loss_layer_021": 0.088379, "value_mse_loss_layer_022": 0.074219, "value_mse_loss_layer_023": 0.095215, "value_mse_loss_layer_024": 0.102539, "value_mse_loss_layer_025": 0.128906, "value_mse_loss_layer_026": 0.10791, "value_mse_loss_layer_027": 0.134766, "value_mse_loss_layer_028": 0.12793, "value_mse_loss_layer_029": 0.201172, "value_mse_loss_layer_030": 0.173828, "value_mse_loss_layer_031": 0.223633, "vq_loss_layer_000": 2.7e-05, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 2.4e-05, "vq_loss_layer_003": 0.000106, "vq_loss_layer_004": 0.000185, "vq_loss_layer_005": 0.000219, "vq_loss_layer_006": 0.00033, "vq_loss_layer_007": 0.000473, "vq_loss_layer_008": 0.000519, "vq_loss_layer_009": 0.000702, "vq_loss_layer_010": 0.00061, "vq_loss_layer_011": 0.000675, "vq_loss_layer_012": 0.001053, "vq_loss_layer_013": 0.000904, "vq_loss_layer_014": 0.001022, "vq_loss_layer_015": 0.00116, "vq_loss_layer_016": 0.001266, "vq_loss_layer_017": 0.000908, "vq_loss_layer_018": 0.000645, "vq_loss_layer_019": 0.000534, "vq_loss_layer_020": 0.000607, "vq_loss_layer_021": 0.001297, "vq_loss_layer_022": 0.000648, "vq_loss_layer_023": 0.000969, "vq_loss_layer_024": 0.000851, "vq_loss_layer_025": 0.001045, "vq_loss_layer_026": 0.001678, "vq_loss_layer_027": 0.001694, "vq_loss_layer_028": 0.001968, "vq_loss_layer_029": 0.003845, "vq_loss_layer_030": 0.005066, "vq_loss_layer_031": 0.013184 }, { "ce_loss": 2.304254, "epoch": 0.0003, "grad_norm": 0.0036337466444820166, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.075891, "kv_vq_loss": 0.001497, "learning_rate": 0.0006192803136799155, "loss": 0.077454, "step": 300, "value_mse_loss_layer_000": 0.001564, "value_mse_loss_layer_001": 0.00473, "value_mse_loss_layer_002": 0.0177, "value_mse_loss_layer_003": 0.029663, "value_mse_loss_layer_004": 0.026978, "value_mse_loss_layer_005": 0.026489, "value_mse_loss_layer_006": 0.030396, "value_mse_loss_layer_007": 0.033936, "value_mse_loss_layer_008": 0.039795, "value_mse_loss_layer_009": 0.049561, "value_mse_loss_layer_010": 0.045166, "value_mse_loss_layer_011": 0.048096, "value_mse_loss_layer_012": 0.050293, "value_mse_loss_layer_013": 0.05127, "value_mse_loss_layer_014": 0.054932, "value_mse_loss_layer_015": 0.053711, "value_mse_loss_layer_016": 0.052734, "value_mse_loss_layer_017": 0.052246, "value_mse_loss_layer_018": 0.054443, "value_mse_loss_layer_019": 0.063477, "value_mse_loss_layer_020": 0.068359, "value_mse_loss_layer_021": 0.083008, "value_mse_loss_layer_022": 0.069824, "value_mse_loss_layer_023": 0.087891, "value_mse_loss_layer_024": 0.094238, "value_mse_loss_layer_025": 0.123047, "value_mse_loss_layer_026": 0.100098, "value_mse_loss_layer_027": 0.128906, "value_mse_loss_layer_028": 0.122559, "value_mse_loss_layer_029": 0.185547, "value_mse_loss_layer_030": 0.165039, "value_mse_loss_layer_031": 0.223633, "vq_loss_layer_000": 2.6e-05, "vq_loss_layer_001": 2.4e-05, "vq_loss_layer_002": 2.8e-05, "vq_loss_layer_003": 0.000114, "vq_loss_layer_004": 0.000183, "vq_loss_layer_005": 0.000236, "vq_loss_layer_006": 0.000349, "vq_loss_layer_007": 0.000515, "vq_loss_layer_008": 0.000641, "vq_loss_layer_009": 0.00069, "vq_loss_layer_010": 0.000664, "vq_loss_layer_011": 0.000702, "vq_loss_layer_012": 0.001068, "vq_loss_layer_013": 0.00116, "vq_loss_layer_014": 0.001083, "vq_loss_layer_015": 0.001015, "vq_loss_layer_016": 0.001396, "vq_loss_layer_017": 0.0009, "vq_loss_layer_018": 0.000591, "vq_loss_layer_019": 0.000519, "vq_loss_layer_020": 0.000652, "vq_loss_layer_021": 0.001335, "vq_loss_layer_022": 0.000614, "vq_loss_layer_023": 0.000862, "vq_loss_layer_024": 0.000805, "vq_loss_layer_025": 0.001045, "vq_loss_layer_026": 0.001526, "vq_loss_layer_027": 0.001778, "vq_loss_layer_028": 0.001999, "vq_loss_layer_029": 0.003372, "vq_loss_layer_030": 0.005463, "vq_loss_layer_031": 0.01416 }, { "ce_loss": 2.271809, "epoch": 0.00031, "grad_norm": 0.005604278761893511, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.074817, "kv_vq_loss": 0.001449, "learning_rate": 0.0006228404234585681, "loss": 0.076312, "step": 310, "value_mse_loss_layer_000": 0.001534, "value_mse_loss_layer_001": 0.004608, "value_mse_loss_layer_002": 0.017212, "value_mse_loss_layer_003": 0.029175, "value_mse_loss_layer_004": 0.026001, "value_mse_loss_layer_005": 0.025757, "value_mse_loss_layer_006": 0.029541, "value_mse_loss_layer_007": 0.032959, "value_mse_loss_layer_008": 0.037842, "value_mse_loss_layer_009": 0.049072, "value_mse_loss_layer_010": 0.044189, "value_mse_loss_layer_011": 0.048096, "value_mse_loss_layer_012": 0.049316, "value_mse_loss_layer_013": 0.05127, "value_mse_loss_layer_014": 0.05542, "value_mse_loss_layer_015": 0.053955, "value_mse_loss_layer_016": 0.050049, "value_mse_loss_layer_017": 0.053467, "value_mse_loss_layer_018": 0.052979, "value_mse_loss_layer_019": 0.064941, "value_mse_loss_layer_020": 0.066895, "value_mse_loss_layer_021": 0.09375, "value_mse_loss_layer_022": 0.070801, "value_mse_loss_layer_023": 0.101074, "value_mse_loss_layer_024": 0.093262, "value_mse_loss_layer_025": 0.125, "value_mse_loss_layer_026": 0.097656, "value_mse_loss_layer_027": 0.125977, "value_mse_loss_layer_028": 0.121094, "value_mse_loss_layer_029": 0.175781, "value_mse_loss_layer_030": 0.157227, "value_mse_loss_layer_031": 0.21875, "vq_loss_layer_000": 2.6e-05, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 2.5e-05, "vq_loss_layer_003": 8.7e-05, "vq_loss_layer_004": 0.000179, "vq_loss_layer_005": 0.000226, "vq_loss_layer_006": 0.00033, "vq_loss_layer_007": 0.000515, "vq_loss_layer_008": 0.000549, "vq_loss_layer_009": 0.000668, "vq_loss_layer_010": 0.000652, "vq_loss_layer_011": 0.000664, "vq_loss_layer_012": 0.00106, "vq_loss_layer_013": 0.000927, "vq_loss_layer_014": 0.001106, "vq_loss_layer_015": 0.001091, "vq_loss_layer_016": 0.001167, "vq_loss_layer_017": 0.000969, "vq_loss_layer_018": 0.000587, "vq_loss_layer_019": 0.000538, "vq_loss_layer_020": 0.000614, "vq_loss_layer_021": 0.001678, "vq_loss_layer_022": 0.000683, "vq_loss_layer_023": 0.001381, "vq_loss_layer_024": 0.000744, "vq_loss_layer_025": 0.000965, "vq_loss_layer_026": 0.001358, "vq_loss_layer_027": 0.001503, "vq_loss_layer_028": 0.001884, "vq_loss_layer_029": 0.003006, "vq_loss_layer_030": 0.004364, "vq_loss_layer_031": 0.013733 }, { "ce_loss": 2.240761, "epoch": 0.00032, "grad_norm": 0.006903391797095537, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.075043, "kv_vq_loss": 0.00146, "learning_rate": 0.0006262874945799764, "loss": 0.076556, "step": 320, "value_mse_loss_layer_000": 0.001579, "value_mse_loss_layer_001": 0.004669, "value_mse_loss_layer_002": 0.017578, "value_mse_loss_layer_003": 0.028564, "value_mse_loss_layer_004": 0.026245, "value_mse_loss_layer_005": 0.025879, "value_mse_loss_layer_006": 0.029785, "value_mse_loss_layer_007": 0.032959, "value_mse_loss_layer_008": 0.037109, "value_mse_loss_layer_009": 0.048584, "value_mse_loss_layer_010": 0.044189, "value_mse_loss_layer_011": 0.047363, "value_mse_loss_layer_012": 0.049316, "value_mse_loss_layer_013": 0.049805, "value_mse_loss_layer_014": 0.056641, "value_mse_loss_layer_015": 0.052734, "value_mse_loss_layer_016": 0.050537, "value_mse_loss_layer_017": 0.052734, "value_mse_loss_layer_018": 0.052734, "value_mse_loss_layer_019": 0.063965, "value_mse_loss_layer_020": 0.067383, "value_mse_loss_layer_021": 0.078613, "value_mse_loss_layer_022": 0.071777, "value_mse_loss_layer_023": 0.087402, "value_mse_loss_layer_024": 0.091797, "value_mse_loss_layer_025": 0.12793, "value_mse_loss_layer_026": 0.092285, "value_mse_loss_layer_027": 0.125, "value_mse_loss_layer_028": 0.122559, "value_mse_loss_layer_029": 0.185547, "value_mse_loss_layer_030": 0.165039, "value_mse_loss_layer_031": 0.214844, "vq_loss_layer_000": 2.7e-05, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 2.4e-05, "vq_loss_layer_003": 8.8e-05, "vq_loss_layer_004": 0.000193, "vq_loss_layer_005": 0.000241, "vq_loss_layer_006": 0.000355, "vq_loss_layer_007": 0.000486, "vq_loss_layer_008": 0.000504, "vq_loss_layer_009": 0.000652, "vq_loss_layer_010": 0.000599, "vq_loss_layer_011": 0.000626, "vq_loss_layer_012": 0.001091, "vq_loss_layer_013": 0.000835, "vq_loss_layer_014": 0.001083, "vq_loss_layer_015": 0.001053, "vq_loss_layer_016": 0.001122, "vq_loss_layer_017": 0.000908, "vq_loss_layer_018": 0.000538, "vq_loss_layer_019": 0.00046, "vq_loss_layer_020": 0.000599, "vq_loss_layer_021": 0.001167, "vq_loss_layer_022": 0.000702, "vq_loss_layer_023": 0.000893, "vq_loss_layer_024": 0.00069, "vq_loss_layer_025": 0.001091, "vq_loss_layer_026": 0.001266, "vq_loss_layer_027": 0.00161, "vq_loss_layer_028": 0.001968, "vq_loss_layer_029": 0.003372, "vq_loss_layer_030": 0.004913, "vq_loss_layer_031": 0.012573 }, { "ce_loss": 2.262409, "epoch": 0.00033, "grad_norm": 0.007358509581536055, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.05542, "key_mse_loss_layer_004": 0.065918, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.074353, "kv_vq_loss": 0.00143, "learning_rate": 0.0006296284849694718, "loss": 0.075824, "step": 330, "value_mse_loss_layer_000": 0.001564, "value_mse_loss_layer_001": 0.0047, "value_mse_loss_layer_002": 0.017822, "value_mse_loss_layer_003": 0.028687, "value_mse_loss_layer_004": 0.024536, "value_mse_loss_layer_005": 0.023804, "value_mse_loss_layer_006": 0.028687, "value_mse_loss_layer_007": 0.031738, "value_mse_loss_layer_008": 0.037109, "value_mse_loss_layer_009": 0.046875, "value_mse_loss_layer_010": 0.041992, "value_mse_loss_layer_011": 0.047852, "value_mse_loss_layer_012": 0.047607, "value_mse_loss_layer_013": 0.046875, "value_mse_loss_layer_014": 0.051758, "value_mse_loss_layer_015": 0.052246, "value_mse_loss_layer_016": 0.049072, "value_mse_loss_layer_017": 0.050781, "value_mse_loss_layer_018": 0.054688, "value_mse_loss_layer_019": 0.062988, "value_mse_loss_layer_020": 0.072754, "value_mse_loss_layer_021": 0.099609, "value_mse_loss_layer_022": 0.071289, "value_mse_loss_layer_023": 0.089844, "value_mse_loss_layer_024": 0.093262, "value_mse_loss_layer_025": 0.130859, "value_mse_loss_layer_026": 0.100098, "value_mse_loss_layer_027": 0.12793, "value_mse_loss_layer_028": 0.124023, "value_mse_loss_layer_029": 0.19043, "value_mse_loss_layer_030": 0.166016, "value_mse_loss_layer_031": 0.214844, "vq_loss_layer_000": 2.4e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 7.7e-05, "vq_loss_layer_004": 0.000163, "vq_loss_layer_005": 0.000189, "vq_loss_layer_006": 0.000328, "vq_loss_layer_007": 0.000492, "vq_loss_layer_008": 0.000475, "vq_loss_layer_009": 0.000633, "vq_loss_layer_010": 0.000534, "vq_loss_layer_011": 0.00071, "vq_loss_layer_012": 0.00103, "vq_loss_layer_013": 0.000813, "vq_loss_layer_014": 0.000931, "vq_loss_layer_015": 0.001045, "vq_loss_layer_016": 0.001091, "vq_loss_layer_017": 0.000908, "vq_loss_layer_018": 0.000549, "vq_loss_layer_019": 0.000427, "vq_loss_layer_020": 0.000622, "vq_loss_layer_021": 0.001434, "vq_loss_layer_022": 0.000549, "vq_loss_layer_023": 0.000763, "vq_loss_layer_024": 0.000553, "vq_loss_layer_025": 0.000973, "vq_loss_layer_026": 0.001297, "vq_loss_layer_027": 0.00135, "vq_loss_layer_028": 0.001549, "vq_loss_layer_029": 0.003143, "vq_loss_layer_030": 0.004303, "vq_loss_layer_031": 0.010681 }, { "ce_loss": 2.293657, "epoch": 0.00034, "grad_norm": 0.006010144483298063, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.073633, "kv_vq_loss": 0.001407, "learning_rate": 0.0006328697292605637, "loss": 0.075098, "step": 340, "value_mse_loss_layer_000": 0.001488, "value_mse_loss_layer_001": 0.004517, "value_mse_loss_layer_002": 0.017334, "value_mse_loss_layer_003": 0.028442, "value_mse_loss_layer_004": 0.025513, "value_mse_loss_layer_005": 0.025146, "value_mse_loss_layer_006": 0.028931, "value_mse_loss_layer_007": 0.032715, "value_mse_loss_layer_008": 0.036133, "value_mse_loss_layer_009": 0.047607, "value_mse_loss_layer_010": 0.041992, "value_mse_loss_layer_011": 0.044922, "value_mse_loss_layer_012": 0.047363, "value_mse_loss_layer_013": 0.052246, "value_mse_loss_layer_014": 0.051514, "value_mse_loss_layer_015": 0.051025, "value_mse_loss_layer_016": 0.048584, "value_mse_loss_layer_017": 0.049805, "value_mse_loss_layer_018": 0.053223, "value_mse_loss_layer_019": 0.061279, "value_mse_loss_layer_020": 0.06543, "value_mse_loss_layer_021": 0.084961, "value_mse_loss_layer_022": 0.068359, "value_mse_loss_layer_023": 0.087891, "value_mse_loss_layer_024": 0.092285, "value_mse_loss_layer_025": 0.121582, "value_mse_loss_layer_026": 0.102539, "value_mse_loss_layer_027": 0.142578, "value_mse_loss_layer_028": 0.12207, "value_mse_loss_layer_029": 0.179688, "value_mse_loss_layer_030": 0.162109, "value_mse_loss_layer_031": 0.212891, "vq_loss_layer_000": 2.4e-05, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 2.3e-05, "vq_loss_layer_003": 9.1e-05, "vq_loss_layer_004": 0.000179, "vq_loss_layer_005": 0.000216, "vq_loss_layer_006": 0.000362, "vq_loss_layer_007": 0.000523, "vq_loss_layer_008": 0.000496, "vq_loss_layer_009": 0.000668, "vq_loss_layer_010": 0.000584, "vq_loss_layer_011": 0.00061, "vq_loss_layer_012": 0.001053, "vq_loss_layer_013": 0.001076, "vq_loss_layer_014": 0.000973, "vq_loss_layer_015": 0.001022, "vq_loss_layer_016": 0.001198, "vq_loss_layer_017": 0.000835, "vq_loss_layer_018": 0.000557, "vq_loss_layer_019": 0.000488, "vq_loss_layer_020": 0.000523, "vq_loss_layer_021": 0.001335, "vq_loss_layer_022": 0.000591, "vq_loss_layer_023": 0.000797, "vq_loss_layer_024": 0.00066, "vq_loss_layer_025": 0.001015, "vq_loss_layer_026": 0.001587, "vq_loss_layer_027": 0.002213, "vq_loss_layer_028": 0.001907, "vq_loss_layer_029": 0.003677, "vq_loss_layer_030": 0.004578, "vq_loss_layer_031": 0.012695 }, { "ce_loss": 2.248736, "epoch": 0.00035, "grad_norm": 0.005647663958370686, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.07666, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.094238, "key_mse_loss_layer_029": 0.092285, "key_mse_loss_layer_030": 0.097656, "key_mse_loss_layer_031": 0.088379, "kv_mse_loss": 0.073767, "kv_vq_loss": 0.00141, "learning_rate": 0.0006360170110875688, "loss": 0.075244, "step": 350, "value_mse_loss_layer_000": 0.001488, "value_mse_loss_layer_001": 0.004456, "value_mse_loss_layer_002": 0.016724, "value_mse_loss_layer_003": 0.028442, "value_mse_loss_layer_004": 0.026611, "value_mse_loss_layer_005": 0.026123, "value_mse_loss_layer_006": 0.029541, "value_mse_loss_layer_007": 0.033203, "value_mse_loss_layer_008": 0.037842, "value_mse_loss_layer_009": 0.050293, "value_mse_loss_layer_010": 0.044922, "value_mse_loss_layer_011": 0.046631, "value_mse_loss_layer_012": 0.049316, "value_mse_loss_layer_013": 0.050537, "value_mse_loss_layer_014": 0.056641, "value_mse_loss_layer_015": 0.05127, "value_mse_loss_layer_016": 0.050781, "value_mse_loss_layer_017": 0.051514, "value_mse_loss_layer_018": 0.056885, "value_mse_loss_layer_019": 0.067871, "value_mse_loss_layer_020": 0.067383, "value_mse_loss_layer_021": 0.080566, "value_mse_loss_layer_022": 0.071777, "value_mse_loss_layer_023": 0.108887, "value_mse_loss_layer_024": 0.104004, "value_mse_loss_layer_025": 0.12793, "value_mse_loss_layer_026": 0.109375, "value_mse_loss_layer_027": 0.138672, "value_mse_loss_layer_028": 0.139648, "value_mse_loss_layer_029": 0.204102, "value_mse_loss_layer_030": 0.173828, "value_mse_loss_layer_031": 0.220703, "vq_loss_layer_000": 2.6e-05, "vq_loss_layer_001": 2.8e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 8.2e-05, "vq_loss_layer_004": 0.000175, "vq_loss_layer_005": 0.000218, "vq_loss_layer_006": 0.000322, "vq_loss_layer_007": 0.000484, "vq_loss_layer_008": 0.000526, "vq_loss_layer_009": 0.000729, "vq_loss_layer_010": 0.00066, "vq_loss_layer_011": 0.000607, "vq_loss_layer_012": 0.00106, "vq_loss_layer_013": 0.000912, "vq_loss_layer_014": 0.001015, "vq_loss_layer_015": 0.000885, "vq_loss_layer_016": 0.001099, "vq_loss_layer_017": 0.000786, "vq_loss_layer_018": 0.000717, "vq_loss_layer_019": 0.000546, "vq_loss_layer_020": 0.000471, "vq_loss_layer_021": 0.000881, "vq_loss_layer_022": 0.000546, "vq_loss_layer_023": 0.000984, "vq_loss_layer_024": 0.000874, "vq_loss_layer_025": 0.00116, "vq_loss_layer_026": 0.001678, "vq_loss_layer_027": 0.002213, "vq_loss_layer_028": 0.00322, "vq_loss_layer_029": 0.008301, "vq_loss_layer_030": 0.007812, "vq_loss_layer_031": 0.021606 }, { "ce_loss": 2.280369, "epoch": 0.00036, "grad_norm": 0.005082201212644577, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.060303, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.073309, "kv_vq_loss": 0.001339, "learning_rate": 0.0006390756251918217, "loss": 0.07467, "step": 360, "value_mse_loss_layer_000": 0.001526, "value_mse_loss_layer_001": 0.004425, "value_mse_loss_layer_002": 0.016602, "value_mse_loss_layer_003": 0.029907, "value_mse_loss_layer_004": 0.024048, "value_mse_loss_layer_005": 0.025391, "value_mse_loss_layer_006": 0.028687, "value_mse_loss_layer_007": 0.031982, "value_mse_loss_layer_008": 0.036621, "value_mse_loss_layer_009": 0.046875, "value_mse_loss_layer_010": 0.04248, "value_mse_loss_layer_011": 0.045654, "value_mse_loss_layer_012": 0.047363, "value_mse_loss_layer_013": 0.048584, "value_mse_loss_layer_014": 0.051758, "value_mse_loss_layer_015": 0.053467, "value_mse_loss_layer_016": 0.051758, "value_mse_loss_layer_017": 0.050293, "value_mse_loss_layer_018": 0.051514, "value_mse_loss_layer_019": 0.0625, "value_mse_loss_layer_020": 0.064941, "value_mse_loss_layer_021": 0.08252, "value_mse_loss_layer_022": 0.071289, "value_mse_loss_layer_023": 0.089355, "value_mse_loss_layer_024": 0.088379, "value_mse_loss_layer_025": 0.12207, "value_mse_loss_layer_026": 0.092773, "value_mse_loss_layer_027": 0.123047, "value_mse_loss_layer_028": 0.119141, "value_mse_loss_layer_029": 0.170898, "value_mse_loss_layer_030": 0.154297, "value_mse_loss_layer_031": 0.208984, "vq_loss_layer_000": 2.4e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 8.9e-05, "vq_loss_layer_004": 0.000167, "vq_loss_layer_005": 0.000244, "vq_loss_layer_006": 0.000349, "vq_loss_layer_007": 0.000496, "vq_loss_layer_008": 0.000469, "vq_loss_layer_009": 0.000607, "vq_loss_layer_010": 0.00058, "vq_loss_layer_011": 0.00061, "vq_loss_layer_012": 0.00103, "vq_loss_layer_013": 0.000874, "vq_loss_layer_014": 0.000961, "vq_loss_layer_015": 0.001114, "vq_loss_layer_016": 0.001213, "vq_loss_layer_017": 0.000809, "vq_loss_layer_018": 0.00053, "vq_loss_layer_019": 0.000484, "vq_loss_layer_020": 0.000568, "vq_loss_layer_021": 0.00116, "vq_loss_layer_022": 0.000637, "vq_loss_layer_023": 0.000862, "vq_loss_layer_024": 0.000557, "vq_loss_layer_025": 0.000881, "vq_loss_layer_026": 0.00116, "vq_loss_layer_027": 0.001404, "vq_loss_layer_028": 0.001625, "vq_loss_layer_029": 0.002792, "vq_loss_layer_030": 0.003998, "vq_loss_layer_031": 0.011292 }, { "ce_loss": 2.221364, "epoch": 0.00037, "grad_norm": 0.007326270919293165, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.0737, "kv_vq_loss": 0.001423, "learning_rate": 0.0006420504310167487, "loss": 0.075153, "step": 370, "value_mse_loss_layer_000": 0.001503, "value_mse_loss_layer_001": 0.004395, "value_mse_loss_layer_002": 0.016479, "value_mse_loss_layer_003": 0.027344, "value_mse_loss_layer_004": 0.02478, "value_mse_loss_layer_005": 0.02478, "value_mse_loss_layer_006": 0.028564, "value_mse_loss_layer_007": 0.031738, "value_mse_loss_layer_008": 0.035645, "value_mse_loss_layer_009": 0.048096, "value_mse_loss_layer_010": 0.042236, "value_mse_loss_layer_011": 0.044434, "value_mse_loss_layer_012": 0.04541, "value_mse_loss_layer_013": 0.046387, "value_mse_loss_layer_014": 0.050293, "value_mse_loss_layer_015": 0.049316, "value_mse_loss_layer_016": 0.046631, "value_mse_loss_layer_017": 0.050049, "value_mse_loss_layer_018": 0.054932, "value_mse_loss_layer_019": 0.059814, "value_mse_loss_layer_020": 0.06543, "value_mse_loss_layer_021": 0.077637, "value_mse_loss_layer_022": 0.069336, "value_mse_loss_layer_023": 0.084961, "value_mse_loss_layer_024": 0.089355, "value_mse_loss_layer_025": 0.121582, "value_mse_loss_layer_026": 0.096191, "value_mse_loss_layer_027": 0.122559, "value_mse_loss_layer_028": 0.12207, "value_mse_loss_layer_029": 0.173828, "value_mse_loss_layer_030": 0.180664, "value_mse_loss_layer_031": 0.208008, "vq_loss_layer_000": 2.4e-05, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 8.4e-05, "vq_loss_layer_004": 0.00016, "vq_loss_layer_005": 0.000209, "vq_loss_layer_006": 0.000322, "vq_loss_layer_007": 0.000456, "vq_loss_layer_008": 0.000479, "vq_loss_layer_009": 0.00069, "vq_loss_layer_010": 0.000549, "vq_loss_layer_011": 0.000584, "vq_loss_layer_012": 0.000931, "vq_loss_layer_013": 0.000751, "vq_loss_layer_014": 0.000942, "vq_loss_layer_015": 0.0009, "vq_loss_layer_016": 0.001038, "vq_loss_layer_017": 0.000973, "vq_loss_layer_018": 0.000652, "vq_loss_layer_019": 0.000412, "vq_loss_layer_020": 0.000519, "vq_loss_layer_021": 0.001106, "vq_loss_layer_022": 0.000603, "vq_loss_layer_023": 0.000744, "vq_loss_layer_024": 0.000648, "vq_loss_layer_025": 0.000881, "vq_loss_layer_026": 0.001297, "vq_loss_layer_027": 0.001457, "vq_loss_layer_028": 0.0019, "vq_loss_layer_029": 0.002991, "vq_loss_layer_030": 0.005127, "vq_loss_layer_031": 0.011902 }, { "ce_loss": 2.273028, "epoch": 0.00038, "grad_norm": 0.005229900125414133, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.072833, "kv_vq_loss": 0.001335, "learning_rate": 0.0006449458991542024, "loss": 0.074219, "step": 380, "value_mse_loss_layer_000": 0.00148, "value_mse_loss_layer_001": 0.004303, "value_mse_loss_layer_002": 0.016846, "value_mse_loss_layer_003": 0.026245, "value_mse_loss_layer_004": 0.02417, "value_mse_loss_layer_005": 0.024414, "value_mse_loss_layer_006": 0.026978, "value_mse_loss_layer_007": 0.03125, "value_mse_loss_layer_008": 0.034668, "value_mse_loss_layer_009": 0.046143, "value_mse_loss_layer_010": 0.040527, "value_mse_loss_layer_011": 0.044678, "value_mse_loss_layer_012": 0.044434, "value_mse_loss_layer_013": 0.047363, "value_mse_loss_layer_014": 0.053223, "value_mse_loss_layer_015": 0.049561, "value_mse_loss_layer_016": 0.047607, "value_mse_loss_layer_017": 0.048096, "value_mse_loss_layer_018": 0.051025, "value_mse_loss_layer_019": 0.060303, "value_mse_loss_layer_020": 0.064453, "value_mse_loss_layer_021": 0.075684, "value_mse_loss_layer_022": 0.066895, "value_mse_loss_layer_023": 0.083496, "value_mse_loss_layer_024": 0.086914, "value_mse_loss_layer_025": 0.11084, "value_mse_loss_layer_026": 0.089844, "value_mse_loss_layer_027": 0.117676, "value_mse_loss_layer_028": 0.115234, "value_mse_loss_layer_029": 0.175781, "value_mse_loss_layer_030": 0.158203, "value_mse_loss_layer_031": 0.201172, "vq_loss_layer_000": 2.3e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 7e-05, "vq_loss_layer_004": 0.000185, "vq_loss_layer_005": 0.000226, "vq_loss_layer_006": 0.000292, "vq_loss_layer_007": 0.000463, "vq_loss_layer_008": 0.00045, "vq_loss_layer_009": 0.000641, "vq_loss_layer_010": 0.000526, "vq_loss_layer_011": 0.000599, "vq_loss_layer_012": 0.000973, "vq_loss_layer_013": 0.000893, "vq_loss_layer_014": 0.001175, "vq_loss_layer_015": 0.000946, "vq_loss_layer_016": 0.001091, "vq_loss_layer_017": 0.000771, "vq_loss_layer_018": 0.000561, "vq_loss_layer_019": 0.00045, "vq_loss_layer_020": 0.000576, "vq_loss_layer_021": 0.001122, "vq_loss_layer_022": 0.000557, "vq_loss_layer_023": 0.000786, "vq_loss_layer_024": 0.000637, "vq_loss_layer_025": 0.000805, "vq_loss_layer_026": 0.001106, "vq_loss_layer_027": 0.001266, "vq_loss_layer_028": 0.001526, "vq_loss_layer_029": 0.002869, "vq_loss_layer_030": 0.004303, "vq_loss_layer_031": 0.010803 }, { "ce_loss": 2.276522, "epoch": 0.00039, "grad_norm": 0.005995756946504116, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.072443, "kv_vq_loss": 0.001352, "learning_rate": 0.0006477661517566247, "loss": 0.073816, "step": 390, "value_mse_loss_layer_000": 0.001457, "value_mse_loss_layer_001": 0.004272, "value_mse_loss_layer_002": 0.015869, "value_mse_loss_layer_003": 0.026733, "value_mse_loss_layer_004": 0.023926, "value_mse_loss_layer_005": 0.023926, "value_mse_loss_layer_006": 0.028198, "value_mse_loss_layer_007": 0.030762, "value_mse_loss_layer_008": 0.034424, "value_mse_loss_layer_009": 0.044678, "value_mse_loss_layer_010": 0.042236, "value_mse_loss_layer_011": 0.043213, "value_mse_loss_layer_012": 0.046143, "value_mse_loss_layer_013": 0.04541, "value_mse_loss_layer_014": 0.048584, "value_mse_loss_layer_015": 0.048584, "value_mse_loss_layer_016": 0.047119, "value_mse_loss_layer_017": 0.049316, "value_mse_loss_layer_018": 0.049072, "value_mse_loss_layer_019": 0.060059, "value_mse_loss_layer_020": 0.060059, "value_mse_loss_layer_021": 0.07666, "value_mse_loss_layer_022": 0.06543, "value_mse_loss_layer_023": 0.083008, "value_mse_loss_layer_024": 0.089355, "value_mse_loss_layer_025": 0.114746, "value_mse_loss_layer_026": 0.090332, "value_mse_loss_layer_027": 0.117676, "value_mse_loss_layer_028": 0.117676, "value_mse_loss_layer_029": 0.171875, "value_mse_loss_layer_030": 0.154297, "value_mse_loss_layer_031": 0.208008, "vq_loss_layer_000": 2.4e-05, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 8.2e-05, "vq_loss_layer_004": 0.000149, "vq_loss_layer_005": 0.000206, "vq_loss_layer_006": 0.00036, "vq_loss_layer_007": 0.00046, "vq_loss_layer_008": 0.000452, "vq_loss_layer_009": 0.000576, "vq_loss_layer_010": 0.000614, "vq_loss_layer_011": 0.000584, "vq_loss_layer_012": 0.001038, "vq_loss_layer_013": 0.000771, "vq_loss_layer_014": 0.000919, "vq_loss_layer_015": 0.000916, "vq_loss_layer_016": 0.00116, "vq_loss_layer_017": 0.000896, "vq_loss_layer_018": 0.000483, "vq_loss_layer_019": 0.000471, "vq_loss_layer_020": 0.000542, "vq_loss_layer_021": 0.001175, "vq_loss_layer_022": 0.000553, "vq_loss_layer_023": 0.000793, "vq_loss_layer_024": 0.00066, "vq_loss_layer_025": 0.000923, "vq_loss_layer_026": 0.001205, "vq_loss_layer_027": 0.001335, "vq_loss_layer_028": 0.001862, "vq_loss_layer_029": 0.002716, "vq_loss_layer_030": 0.004486, "vq_loss_layer_031": 0.012329 }, { "ce_loss": 2.271442, "epoch": 0.0004, "grad_norm": 0.00642418023198843, "key_mse_loss_layer_000": 0.002884, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.072577, "kv_vq_loss": 0.001313, "learning_rate": 0.0006505149978319906, "loss": 0.073926, "step": 400, "value_mse_loss_layer_000": 0.001465, "value_mse_loss_layer_001": 0.004242, "value_mse_loss_layer_002": 0.015869, "value_mse_loss_layer_003": 0.026367, "value_mse_loss_layer_004": 0.02356, "value_mse_loss_layer_005": 0.024048, "value_mse_loss_layer_006": 0.027344, "value_mse_loss_layer_007": 0.030762, "value_mse_loss_layer_008": 0.036133, "value_mse_loss_layer_009": 0.047607, "value_mse_loss_layer_010": 0.041016, "value_mse_loss_layer_011": 0.044189, "value_mse_loss_layer_012": 0.045898, "value_mse_loss_layer_013": 0.046143, "value_mse_loss_layer_014": 0.049072, "value_mse_loss_layer_015": 0.049072, "value_mse_loss_layer_016": 0.046875, "value_mse_loss_layer_017": 0.048096, "value_mse_loss_layer_018": 0.048584, "value_mse_loss_layer_019": 0.059082, "value_mse_loss_layer_020": 0.062988, "value_mse_loss_layer_021": 0.077637, "value_mse_loss_layer_022": 0.068359, "value_mse_loss_layer_023": 0.08252, "value_mse_loss_layer_024": 0.091309, "value_mse_loss_layer_025": 0.117188, "value_mse_loss_layer_026": 0.091797, "value_mse_loss_layer_027": 0.132812, "value_mse_loss_layer_028": 0.115234, "value_mse_loss_layer_029": 0.167969, "value_mse_loss_layer_030": 0.149414, "value_mse_loss_layer_031": 0.204102, "vq_loss_layer_000": 2.3e-05, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 7.4e-05, "vq_loss_layer_004": 0.000151, "vq_loss_layer_005": 0.000209, "vq_loss_layer_006": 0.000303, "vq_loss_layer_007": 0.000456, "vq_loss_layer_008": 0.000519, "vq_loss_layer_009": 0.00074, "vq_loss_layer_010": 0.000576, "vq_loss_layer_011": 0.000622, "vq_loss_layer_012": 0.001038, "vq_loss_layer_013": 0.000824, "vq_loss_layer_014": 0.000946, "vq_loss_layer_015": 0.00095, "vq_loss_layer_016": 0.001129, "vq_loss_layer_017": 0.000805, "vq_loss_layer_018": 0.000492, "vq_loss_layer_019": 0.000444, "vq_loss_layer_020": 0.000614, "vq_loss_layer_021": 0.001137, "vq_loss_layer_022": 0.000698, "vq_loss_layer_023": 0.000774, "vq_loss_layer_024": 0.000652, "vq_loss_layer_025": 0.000839, "vq_loss_layer_026": 0.001289, "vq_loss_layer_027": 0.001968, "vq_loss_layer_028": 0.001762, "vq_loss_layer_029": 0.002914, "vq_loss_layer_030": 0.004211, "vq_loss_layer_031": 0.012146 }, { "ce_loss": 2.299244, "epoch": 0.00041, "grad_norm": 0.004565058276057243, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.072174, "kv_vq_loss": 0.001268, "learning_rate": 0.0006531959641799339, "loss": 0.073419, "step": 410, "value_mse_loss_layer_000": 0.001472, "value_mse_loss_layer_001": 0.004211, "value_mse_loss_layer_002": 0.016113, "value_mse_loss_layer_003": 0.026611, "value_mse_loss_layer_004": 0.02356, "value_mse_loss_layer_005": 0.02417, "value_mse_loss_layer_006": 0.026855, "value_mse_loss_layer_007": 0.03125, "value_mse_loss_layer_008": 0.034912, "value_mse_loss_layer_009": 0.046143, "value_mse_loss_layer_010": 0.039795, "value_mse_loss_layer_011": 0.043945, "value_mse_loss_layer_012": 0.044678, "value_mse_loss_layer_013": 0.046631, "value_mse_loss_layer_014": 0.056885, "value_mse_loss_layer_015": 0.049072, "value_mse_loss_layer_016": 0.047607, "value_mse_loss_layer_017": 0.048096, "value_mse_loss_layer_018": 0.048828, "value_mse_loss_layer_019": 0.058838, "value_mse_loss_layer_020": 0.061279, "value_mse_loss_layer_021": 0.076172, "value_mse_loss_layer_022": 0.06543, "value_mse_loss_layer_023": 0.082031, "value_mse_loss_layer_024": 0.087402, "value_mse_loss_layer_025": 0.110352, "value_mse_loss_layer_026": 0.088867, "value_mse_loss_layer_027": 0.122559, "value_mse_loss_layer_028": 0.112793, "value_mse_loss_layer_029": 0.166992, "value_mse_loss_layer_030": 0.154297, "value_mse_loss_layer_031": 0.199219, "vq_loss_layer_000": 2.3e-05, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 8.7e-05, "vq_loss_layer_004": 0.000148, "vq_loss_layer_005": 0.000231, "vq_loss_layer_006": 0.000294, "vq_loss_layer_007": 0.000486, "vq_loss_layer_008": 0.000454, "vq_loss_layer_009": 0.000622, "vq_loss_layer_010": 0.000519, "vq_loss_layer_011": 0.000595, "vq_loss_layer_012": 0.000946, "vq_loss_layer_013": 0.000832, "vq_loss_layer_014": 0.001106, "vq_loss_layer_015": 0.000961, "vq_loss_layer_016": 0.001129, "vq_loss_layer_017": 0.000816, "vq_loss_layer_018": 0.000488, "vq_loss_layer_019": 0.000469, "vq_loss_layer_020": 0.00058, "vq_loss_layer_021": 0.001205, "vq_loss_layer_022": 0.000561, "vq_loss_layer_023": 0.000782, "vq_loss_layer_024": 0.000633, "vq_loss_layer_025": 0.000835, "vq_loss_layer_026": 0.00116, "vq_loss_layer_027": 0.001503, "vq_loss_layer_028": 0.001587, "vq_loss_layer_029": 0.002777, "vq_loss_layer_030": 0.004364, "vq_loss_layer_031": 0.011475 }, { "ce_loss": 2.294049, "epoch": 0.00042, "grad_norm": 0.005925820209085941, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.106445, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.129883, "key_mse_loss_layer_014": 0.126953, "key_mse_loss_layer_015": 0.115234, "key_mse_loss_layer_016": 0.108398, "key_mse_loss_layer_017": 0.108398, "key_mse_loss_layer_018": 0.114746, "key_mse_loss_layer_019": 0.094238, "key_mse_loss_layer_020": 0.10791, "key_mse_loss_layer_021": 0.100586, "key_mse_loss_layer_022": 0.104004, "key_mse_loss_layer_023": 0.101074, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.087891, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.072076, "kv_vq_loss": 0.001328, "learning_rate": 0.000655812322599475, "loss": 0.073407, "step": 420, "value_mse_loss_layer_000": 0.00145, "value_mse_loss_layer_001": 0.00415, "value_mse_loss_layer_002": 0.017944, "value_mse_loss_layer_003": 0.026611, "value_mse_loss_layer_004": 0.026245, "value_mse_loss_layer_005": 0.02417, "value_mse_loss_layer_006": 0.029053, "value_mse_loss_layer_007": 0.030518, "value_mse_loss_layer_008": 0.033691, "value_mse_loss_layer_009": 0.044189, "value_mse_loss_layer_010": 0.041992, "value_mse_loss_layer_011": 0.042725, "value_mse_loss_layer_012": 0.043213, "value_mse_loss_layer_013": 0.044189, "value_mse_loss_layer_014": 0.04834, "value_mse_loss_layer_015": 0.043945, "value_mse_loss_layer_016": 0.042969, "value_mse_loss_layer_017": 0.045654, "value_mse_loss_layer_018": 0.051514, "value_mse_loss_layer_019": 0.055176, "value_mse_loss_layer_020": 0.060303, "value_mse_loss_layer_021": 0.067871, "value_mse_loss_layer_022": 0.063477, "value_mse_loss_layer_023": 0.077148, "value_mse_loss_layer_024": 0.08252, "value_mse_loss_layer_025": 0.10791, "value_mse_loss_layer_026": 0.083008, "value_mse_loss_layer_027": 0.10791, "value_mse_loss_layer_028": 0.107422, "value_mse_loss_layer_029": 0.154297, "value_mse_loss_layer_030": 0.147461, "value_mse_loss_layer_031": 0.209961, "vq_loss_layer_000": 2.3e-05, "vq_loss_layer_001": 3.3e-05, "vq_loss_layer_002": 3.7e-05, "vq_loss_layer_003": 8.8e-05, "vq_loss_layer_004": 0.000252, "vq_loss_layer_005": 0.000216, "vq_loss_layer_006": 0.000416, "vq_loss_layer_007": 0.000462, "vq_loss_layer_008": 0.000481, "vq_loss_layer_009": 0.000591, "vq_loss_layer_010": 0.000629, "vq_loss_layer_011": 0.000603, "vq_loss_layer_012": 0.000999, "vq_loss_layer_013": 0.000809, "vq_loss_layer_014": 0.001015, "vq_loss_layer_015": 0.00079, "vq_loss_layer_016": 0.000973, "vq_loss_layer_017": 0.000771, "vq_loss_layer_018": 0.00058, "vq_loss_layer_019": 0.000437, "vq_loss_layer_020": 0.000553, "vq_loss_layer_021": 0.001038, "vq_loss_layer_022": 0.000652, "vq_loss_layer_023": 0.000858, "vq_loss_layer_024": 0.000717, "vq_loss_layer_025": 0.001129, "vq_loss_layer_026": 0.001358, "vq_loss_layer_027": 0.001305, "vq_loss_layer_028": 0.001938, "vq_loss_layer_029": 0.002701, "vq_loss_layer_030": 0.004761, "vq_loss_layer_031": 0.014526 }, { "ce_loss": 2.301712, "epoch": 0.00043, "grad_norm": 0.007492033299058676, "key_mse_loss_layer_000": 0.003891, "key_mse_loss_layer_001": 0.011414, "key_mse_loss_layer_002": 0.059082, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.106934, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.087891, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.071887, "kv_vq_loss": 0.001313, "learning_rate": 0.0006583671138948965, "loss": 0.073163, "step": 430, "value_mse_loss_layer_000": 0.001434, "value_mse_loss_layer_001": 0.004181, "value_mse_loss_layer_002": 0.016235, "value_mse_loss_layer_003": 0.026978, "value_mse_loss_layer_004": 0.025024, "value_mse_loss_layer_005": 0.024414, "value_mse_loss_layer_006": 0.026978, "value_mse_loss_layer_007": 0.030273, "value_mse_loss_layer_008": 0.034424, "value_mse_loss_layer_009": 0.042969, "value_mse_loss_layer_010": 0.038086, "value_mse_loss_layer_011": 0.040771, "value_mse_loss_layer_012": 0.043213, "value_mse_loss_layer_013": 0.04126, "value_mse_loss_layer_014": 0.046387, "value_mse_loss_layer_015": 0.047119, "value_mse_loss_layer_016": 0.044189, "value_mse_loss_layer_017": 0.044922, "value_mse_loss_layer_018": 0.051758, "value_mse_loss_layer_019": 0.058105, "value_mse_loss_layer_020": 0.074707, "value_mse_loss_layer_021": 0.077637, "value_mse_loss_layer_022": 0.064941, "value_mse_loss_layer_023": 0.08252, "value_mse_loss_layer_024": 0.115723, "value_mse_loss_layer_025": 0.119141, "value_mse_loss_layer_026": 0.099609, "value_mse_loss_layer_027": 0.129883, "value_mse_loss_layer_028": 0.117188, "value_mse_loss_layer_029": 0.188477, "value_mse_loss_layer_030": 0.169922, "value_mse_loss_layer_031": 0.21875, "vq_loss_layer_000": 2.7e-05, "vq_loss_layer_001": 3.8e-05, "vq_loss_layer_002": 4.2e-05, "vq_loss_layer_003": 7.8e-05, "vq_loss_layer_004": 0.000165, "vq_loss_layer_005": 0.000211, "vq_loss_layer_006": 0.000301, "vq_loss_layer_007": 0.000412, "vq_loss_layer_008": 0.000469, "vq_loss_layer_009": 0.000568, "vq_loss_layer_010": 0.000568, "vq_loss_layer_011": 0.000584, "vq_loss_layer_012": 0.000889, "vq_loss_layer_013": 0.000668, "vq_loss_layer_014": 0.000881, "vq_loss_layer_015": 0.001472, "vq_loss_layer_016": 0.001007, "vq_loss_layer_017": 0.00079, "vq_loss_layer_018": 0.000603, "vq_loss_layer_019": 0.000481, "vq_loss_layer_020": 0.000599, "vq_loss_layer_021": 0.00106, "vq_loss_layer_022": 0.00053, "vq_loss_layer_023": 0.000664, "vq_loss_layer_024": 0.001053, "vq_loss_layer_025": 0.001045, "vq_loss_layer_026": 0.001457, "vq_loss_layer_027": 0.00174, "vq_loss_layer_028": 0.002045, "vq_loss_layer_029": 0.004517, "vq_loss_layer_030": 0.0065, "vq_loss_layer_031": 0.0177 }, { "ce_loss": 2.27198, "epoch": 0.00044, "grad_norm": 0.006146986037492752, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.0716, "kv_vq_loss": 0.001273, "learning_rate": 0.0006608631691215468, "loss": 0.072852, "step": 440, "value_mse_loss_layer_000": 0.001457, "value_mse_loss_layer_001": 0.00412, "value_mse_loss_layer_002": 0.015625, "value_mse_loss_layer_003": 0.025146, "value_mse_loss_layer_004": 0.022827, "value_mse_loss_layer_005": 0.02478, "value_mse_loss_layer_006": 0.026489, "value_mse_loss_layer_007": 0.030029, "value_mse_loss_layer_008": 0.034668, "value_mse_loss_layer_009": 0.044922, "value_mse_loss_layer_010": 0.039795, "value_mse_loss_layer_011": 0.04541, "value_mse_loss_layer_012": 0.045654, "value_mse_loss_layer_013": 0.045654, "value_mse_loss_layer_014": 0.050293, "value_mse_loss_layer_015": 0.049561, "value_mse_loss_layer_016": 0.046143, "value_mse_loss_layer_017": 0.048828, "value_mse_loss_layer_018": 0.047852, "value_mse_loss_layer_019": 0.060791, "value_mse_loss_layer_020": 0.05957, "value_mse_loss_layer_021": 0.077148, "value_mse_loss_layer_022": 0.064453, "value_mse_loss_layer_023": 0.082031, "value_mse_loss_layer_024": 0.083984, "value_mse_loss_layer_025": 0.109863, "value_mse_loss_layer_026": 0.084473, "value_mse_loss_layer_027": 0.113281, "value_mse_loss_layer_028": 0.112793, "value_mse_loss_layer_029": 0.166992, "value_mse_loss_layer_030": 0.15332, "value_mse_loss_layer_031": 0.192383, "vq_loss_layer_000": 2.3e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 7.2e-05, "vq_loss_layer_004": 0.000145, "vq_loss_layer_005": 0.000244, "vq_loss_layer_006": 0.000298, "vq_loss_layer_007": 0.000441, "vq_loss_layer_008": 0.000462, "vq_loss_layer_009": 0.000553, "vq_loss_layer_010": 0.000515, "vq_loss_layer_011": 0.000668, "vq_loss_layer_012": 0.000935, "vq_loss_layer_013": 0.000805, "vq_loss_layer_014": 0.001015, "vq_loss_layer_015": 0.000965, "vq_loss_layer_016": 0.001068, "vq_loss_layer_017": 0.000839, "vq_loss_layer_018": 0.000515, "vq_loss_layer_019": 0.000433, "vq_loss_layer_020": 0.000526, "vq_loss_layer_021": 0.001198, "vq_loss_layer_022": 0.000561, "vq_loss_layer_023": 0.000847, "vq_loss_layer_024": 0.000702, "vq_loss_layer_025": 0.000938, "vq_loss_layer_026": 0.001129, "vq_loss_layer_027": 0.001381, "vq_loss_layer_028": 0.001999, "vq_loss_layer_029": 0.003281, "vq_loss_layer_030": 0.004486, "vq_loss_layer_031": 0.011963 }, { "ce_loss": 2.283042, "epoch": 0.00045, "grad_norm": 0.007489431649446487, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.071143, "kv_vq_loss": 0.001269, "learning_rate": 0.0006633031284438359, "loss": 0.072375, "step": 450, "value_mse_loss_layer_000": 0.001358, "value_mse_loss_layer_001": 0.004028, "value_mse_loss_layer_002": 0.015564, "value_mse_loss_layer_003": 0.026611, "value_mse_loss_layer_004": 0.024536, "value_mse_loss_layer_005": 0.023804, "value_mse_loss_layer_006": 0.026001, "value_mse_loss_layer_007": 0.029541, "value_mse_loss_layer_008": 0.033691, "value_mse_loss_layer_009": 0.041504, "value_mse_loss_layer_010": 0.037598, "value_mse_loss_layer_011": 0.040771, "value_mse_loss_layer_012": 0.042725, "value_mse_loss_layer_013": 0.041992, "value_mse_loss_layer_014": 0.046875, "value_mse_loss_layer_015": 0.043945, "value_mse_loss_layer_016": 0.044189, "value_mse_loss_layer_017": 0.044189, "value_mse_loss_layer_018": 0.054932, "value_mse_loss_layer_019": 0.05835, "value_mse_loss_layer_020": 0.060791, "value_mse_loss_layer_021": 0.073242, "value_mse_loss_layer_022": 0.067383, "value_mse_loss_layer_023": 0.085938, "value_mse_loss_layer_024": 0.09668, "value_mse_loss_layer_025": 0.119141, "value_mse_loss_layer_026": 0.098145, "value_mse_loss_layer_027": 0.152344, "value_mse_loss_layer_028": 0.120117, "value_mse_loss_layer_029": 0.191406, "value_mse_loss_layer_030": 0.173828, "value_mse_loss_layer_031": 0.223633, "vq_loss_layer_000": 2.4e-05, "vq_loss_layer_001": 3e-05, "vq_loss_layer_002": 3e-05, "vq_loss_layer_003": 8.5e-05, "vq_loss_layer_004": 0.000154, "vq_loss_layer_005": 0.000202, "vq_loss_layer_006": 0.000292, "vq_loss_layer_007": 0.000431, "vq_loss_layer_008": 0.000475, "vq_loss_layer_009": 0.000561, "vq_loss_layer_010": 0.000561, "vq_loss_layer_011": 0.000591, "vq_loss_layer_012": 0.000896, "vq_loss_layer_013": 0.000725, "vq_loss_layer_014": 0.000935, "vq_loss_layer_015": 0.00087, "vq_loss_layer_016": 0.001045, "vq_loss_layer_017": 0.000717, "vq_loss_layer_018": 0.000713, "vq_loss_layer_019": 0.000465, "vq_loss_layer_020": 0.000443, "vq_loss_layer_021": 0.000889, "vq_loss_layer_022": 0.000515, "vq_loss_layer_023": 0.000603, "vq_loss_layer_024": 0.000767, "vq_loss_layer_025": 0.000931, "vq_loss_layer_026": 0.001251, "vq_loss_layer_027": 0.002182, "vq_loss_layer_028": 0.001884, "vq_loss_layer_029": 0.003815, "vq_loss_layer_030": 0.006073, "vq_loss_layer_031": 0.016602 }, { "ce_loss": 2.319347, "epoch": 0.00046, "grad_norm": 0.006619424559175968, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.011169, "key_mse_loss_layer_002": 0.061035, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.072266, "key_mse_loss_layer_007": 0.08252, "key_mse_loss_layer_008": 0.09375, "key_mse_loss_layer_009": 0.099121, "key_mse_loss_layer_010": 0.11084, "key_mse_loss_layer_011": 0.108887, "key_mse_loss_layer_012": 0.082031, "key_mse_loss_layer_013": 0.141602, "key_mse_loss_layer_014": 0.136719, "key_mse_loss_layer_015": 0.123047, "key_mse_loss_layer_016": 0.120605, "key_mse_loss_layer_017": 0.12207, "key_mse_loss_layer_018": 0.131836, "key_mse_loss_layer_019": 0.104492, "key_mse_loss_layer_020": 0.118652, "key_mse_loss_layer_021": 0.11377, "key_mse_loss_layer_022": 0.116211, "key_mse_loss_layer_023": 0.114258, "key_mse_loss_layer_024": 0.090332, "key_mse_loss_layer_025": 0.084961, "key_mse_loss_layer_026": 0.101562, "key_mse_loss_layer_027": 0.096191, "key_mse_loss_layer_028": 0.102051, "key_mse_loss_layer_029": 0.092773, "key_mse_loss_layer_030": 0.101074, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.070758, "kv_vq_loss": 0.001236, "learning_rate": 0.0006656894579203935, "loss": 0.071973, "step": 460, "value_mse_loss_layer_000": 0.001472, "value_mse_loss_layer_001": 0.00412, "value_mse_loss_layer_002": 0.016357, "value_mse_loss_layer_003": 0.027588, "value_mse_loss_layer_004": 0.026978, "value_mse_loss_layer_005": 0.025391, "value_mse_loss_layer_006": 0.028442, "value_mse_loss_layer_007": 0.032959, "value_mse_loss_layer_008": 0.03418, "value_mse_loss_layer_009": 0.045654, "value_mse_loss_layer_010": 0.041504, "value_mse_loss_layer_011": 0.043457, "value_mse_loss_layer_012": 0.048828, "value_mse_loss_layer_013": 0.046631, "value_mse_loss_layer_014": 0.054932, "value_mse_loss_layer_015": 0.045654, "value_mse_loss_layer_016": 0.043701, "value_mse_loss_layer_017": 0.048096, "value_mse_loss_layer_018": 0.049072, "value_mse_loss_layer_019": 0.055908, "value_mse_loss_layer_020": 0.060547, "value_mse_loss_layer_021": 0.083008, "value_mse_loss_layer_022": 0.059814, "value_mse_loss_layer_023": 0.074707, "value_mse_loss_layer_024": 0.083984, "value_mse_loss_layer_025": 0.119629, "value_mse_loss_layer_026": 0.092773, "value_mse_loss_layer_027": 0.117188, "value_mse_loss_layer_028": 0.106934, "value_mse_loss_layer_029": 0.165039, "value_mse_loss_layer_030": 0.166016, "value_mse_loss_layer_031": 0.211914, "vq_loss_layer_000": 2.5e-05, "vq_loss_layer_001": 5e-05, "vq_loss_layer_002": 5.6e-05, "vq_loss_layer_003": 0.000137, "vq_loss_layer_004": 0.000232, "vq_loss_layer_005": 0.00025, "vq_loss_layer_006": 0.00037, "vq_loss_layer_007": 0.000519, "vq_loss_layer_008": 0.00053, "vq_loss_layer_009": 0.000771, "vq_loss_layer_010": 0.00074, "vq_loss_layer_011": 0.000774, "vq_loss_layer_012": 0.001366, "vq_loss_layer_013": 0.000847, "vq_loss_layer_014": 0.001274, "vq_loss_layer_015": 0.00082, "vq_loss_layer_016": 0.000942, "vq_loss_layer_017": 0.000896, "vq_loss_layer_018": 0.00061, "vq_loss_layer_019": 0.000416, "vq_loss_layer_020": 0.00053, "vq_loss_layer_021": 0.00164, "vq_loss_layer_022": 0.000599, "vq_loss_layer_023": 0.000813, "vq_loss_layer_024": 0.000954, "vq_loss_layer_025": 0.001755, "vq_loss_layer_026": 0.001801, "vq_loss_layer_027": 0.002075, "vq_loss_layer_028": 0.002182, "vq_loss_layer_029": 0.00325, "vq_loss_layer_030": 0.006592, "vq_loss_layer_031": 0.018799 }, { "ce_loss": 2.342692, "epoch": 0.00047, "grad_norm": 0.0070615108124911785, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.070721, "kv_vq_loss": 0.00123, "learning_rate": 0.0006680244644839293, "loss": 0.071887, "step": 470, "value_mse_loss_layer_000": 0.001404, "value_mse_loss_layer_001": 0.003937, "value_mse_loss_layer_002": 0.015869, "value_mse_loss_layer_003": 0.026367, "value_mse_loss_layer_004": 0.02417, "value_mse_loss_layer_005": 0.02356, "value_mse_loss_layer_006": 0.030396, "value_mse_loss_layer_007": 0.030273, "value_mse_loss_layer_008": 0.033936, "value_mse_loss_layer_009": 0.044678, "value_mse_loss_layer_010": 0.039551, "value_mse_loss_layer_011": 0.041992, "value_mse_loss_layer_012": 0.044434, "value_mse_loss_layer_013": 0.046387, "value_mse_loss_layer_014": 0.04834, "value_mse_loss_layer_015": 0.046143, "value_mse_loss_layer_016": 0.043701, "value_mse_loss_layer_017": 0.04834, "value_mse_loss_layer_018": 0.047852, "value_mse_loss_layer_019": 0.05542, "value_mse_loss_layer_020": 0.060547, "value_mse_loss_layer_021": 0.071777, "value_mse_loss_layer_022": 0.061279, "value_mse_loss_layer_023": 0.083008, "value_mse_loss_layer_024": 0.08252, "value_mse_loss_layer_025": 0.11377, "value_mse_loss_layer_026": 0.089844, "value_mse_loss_layer_027": 0.111328, "value_mse_loss_layer_028": 0.117676, "value_mse_loss_layer_029": 0.166016, "value_mse_loss_layer_030": 0.154297, "value_mse_loss_layer_031": 0.204102, "vq_loss_layer_000": 2.2e-05, "vq_loss_layer_001": 2.1e-05, "vq_loss_layer_002": 2.3e-05, "vq_loss_layer_003": 7e-05, "vq_loss_layer_004": 0.000171, "vq_loss_layer_005": 0.000192, "vq_loss_layer_006": 0.000492, "vq_loss_layer_007": 0.00046, "vq_loss_layer_008": 0.000483, "vq_loss_layer_009": 0.000641, "vq_loss_layer_010": 0.00058, "vq_loss_layer_011": 0.000587, "vq_loss_layer_012": 0.001045, "vq_loss_layer_013": 0.000839, "vq_loss_layer_014": 0.001015, "vq_loss_layer_015": 0.000969, "vq_loss_layer_016": 0.000969, "vq_loss_layer_017": 0.00087, "vq_loss_layer_018": 0.000504, "vq_loss_layer_019": 0.000393, "vq_loss_layer_020": 0.00053, "vq_loss_layer_021": 0.000992, "vq_loss_layer_022": 0.000546, "vq_loss_layer_023": 0.000809, "vq_loss_layer_024": 0.000603, "vq_loss_layer_025": 0.000904, "vq_loss_layer_026": 0.001305, "vq_loss_layer_027": 0.001282, "vq_loss_layer_028": 0.002121, "vq_loss_layer_029": 0.002686, "vq_loss_layer_030": 0.004883, "vq_loss_layer_031": 0.013062 }, { "ce_loss": 2.2831, "epoch": 0.00048, "grad_norm": 0.005912736523896456, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.045166, "key_mse_loss_layer_004": 0.040039, "key_mse_loss_layer_005": 0.053955, "key_mse_loss_layer_006": 0.061035, "key_mse_loss_layer_007": 0.070312, "key_mse_loss_layer_008": 0.078613, "key_mse_loss_layer_009": 0.083008, "key_mse_loss_layer_010": 0.094238, "key_mse_loss_layer_011": 0.093262, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.089844, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.082031, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.07373, "key_mse_loss_layer_031": 0.056885, "kv_mse_loss": 0.070428, "kv_vq_loss": 0.001245, "learning_rate": 0.0006703103093438967, "loss": 0.071606, "step": 480, "value_mse_loss_layer_000": 0.001419, "value_mse_loss_layer_001": 0.003998, "value_mse_loss_layer_002": 0.015259, "value_mse_loss_layer_003": 0.02478, "value_mse_loss_layer_004": 0.023804, "value_mse_loss_layer_005": 0.024048, "value_mse_loss_layer_006": 0.026123, "value_mse_loss_layer_007": 0.030396, "value_mse_loss_layer_008": 0.033447, "value_mse_loss_layer_009": 0.04541, "value_mse_loss_layer_010": 0.039795, "value_mse_loss_layer_011": 0.041504, "value_mse_loss_layer_012": 0.043945, "value_mse_loss_layer_013": 0.046143, "value_mse_loss_layer_014": 0.050049, "value_mse_loss_layer_015": 0.047852, "value_mse_loss_layer_016": 0.044434, "value_mse_loss_layer_017": 0.046631, "value_mse_loss_layer_018": 0.04834, "value_mse_loss_layer_019": 0.05542, "value_mse_loss_layer_020": 0.061523, "value_mse_loss_layer_021": 0.077637, "value_mse_loss_layer_022": 0.062988, "value_mse_loss_layer_023": 0.080566, "value_mse_loss_layer_024": 0.085938, "value_mse_loss_layer_025": 0.111328, "value_mse_loss_layer_026": 0.097168, "value_mse_loss_layer_027": 0.136719, "value_mse_loss_layer_028": 0.125, "value_mse_loss_layer_029": 0.182617, "value_mse_loss_layer_030": 0.169922, "value_mse_loss_layer_031": 0.211914, "vq_loss_layer_000": 2.3e-05, "vq_loss_layer_001": 4.6e-05, "vq_loss_layer_002": 4.4e-05, "vq_loss_layer_003": 0.000113, "vq_loss_layer_004": 0.00016, "vq_loss_layer_005": 0.000231, "vq_loss_layer_006": 0.000309, "vq_loss_layer_007": 0.000471, "vq_loss_layer_008": 0.000511, "vq_loss_layer_009": 0.000706, "vq_loss_layer_010": 0.000683, "vq_loss_layer_011": 0.00061, "vq_loss_layer_012": 0.001007, "vq_loss_layer_013": 0.000927, "vq_loss_layer_014": 0.001038, "vq_loss_layer_015": 0.001038, "vq_loss_layer_016": 0.001183, "vq_loss_layer_017": 0.000774, "vq_loss_layer_018": 0.000546, "vq_loss_layer_019": 0.000492, "vq_loss_layer_020": 0.000523, "vq_loss_layer_021": 0.001411, "vq_loss_layer_022": 0.000725, "vq_loss_layer_023": 0.000717, "vq_loss_layer_024": 0.000603, "vq_loss_layer_025": 0.000916, "vq_loss_layer_026": 0.001427, "vq_loss_layer_027": 0.001984, "vq_loss_layer_028": 0.002426, "vq_loss_layer_029": 0.004181, "vq_loss_layer_030": 0.00528, "vq_loss_layer_031": 0.016724 }, { "ce_loss": 2.296941, "epoch": 0.00049, "grad_norm": 0.006452593486756086, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.070453, "kv_vq_loss": 0.00124, "learning_rate": 0.0006725490200071283, "loss": 0.071661, "step": 490, "value_mse_loss_layer_000": 0.001381, "value_mse_loss_layer_001": 0.003876, "value_mse_loss_layer_002": 0.015015, "value_mse_loss_layer_003": 0.027222, "value_mse_loss_layer_004": 0.023193, "value_mse_loss_layer_005": 0.023804, "value_mse_loss_layer_006": 0.025635, "value_mse_loss_layer_007": 0.029419, "value_mse_loss_layer_008": 0.032959, "value_mse_loss_layer_009": 0.042969, "value_mse_loss_layer_010": 0.038086, "value_mse_loss_layer_011": 0.041016, "value_mse_loss_layer_012": 0.042725, "value_mse_loss_layer_013": 0.043213, "value_mse_loss_layer_014": 0.048584, "value_mse_loss_layer_015": 0.04541, "value_mse_loss_layer_016": 0.052246, "value_mse_loss_layer_017": 0.046143, "value_mse_loss_layer_018": 0.047119, "value_mse_loss_layer_019": 0.056396, "value_mse_loss_layer_020": 0.059082, "value_mse_loss_layer_021": 0.070801, "value_mse_loss_layer_022": 0.060547, "value_mse_loss_layer_023": 0.078613, "value_mse_loss_layer_024": 0.09082, "value_mse_loss_layer_025": 0.117188, "value_mse_loss_layer_026": 0.088867, "value_mse_loss_layer_027": 0.112305, "value_mse_loss_layer_028": 0.109863, "value_mse_loss_layer_029": 0.163086, "value_mse_loss_layer_030": 0.150391, "value_mse_loss_layer_031": 0.197266, "vq_loss_layer_000": 2.1e-05, "vq_loss_layer_001": 2.3e-05, "vq_loss_layer_002": 2.2e-05, "vq_loss_layer_003": 8.7e-05, "vq_loss_layer_004": 0.000151, "vq_loss_layer_005": 0.000201, "vq_loss_layer_006": 0.000273, "vq_loss_layer_007": 0.000422, "vq_loss_layer_008": 0.000448, "vq_loss_layer_009": 0.000587, "vq_loss_layer_010": 0.000534, "vq_loss_layer_011": 0.000568, "vq_loss_layer_012": 0.000946, "vq_loss_layer_013": 0.000687, "vq_loss_layer_014": 0.001015, "vq_loss_layer_015": 0.000809, "vq_loss_layer_016": 0.001228, "vq_loss_layer_017": 0.000717, "vq_loss_layer_018": 0.000477, "vq_loss_layer_019": 0.000401, "vq_loss_layer_020": 0.000454, "vq_loss_layer_021": 0.000889, "vq_loss_layer_022": 0.000439, "vq_loss_layer_023": 0.000652, "vq_loss_layer_024": 0.000679, "vq_loss_layer_025": 0.0009, "vq_loss_layer_026": 0.00116, "vq_loss_layer_027": 0.001282, "vq_loss_layer_028": 0.001686, "vq_loss_layer_029": 0.00267, "vq_loss_layer_030": 0.00386, "vq_loss_layer_031": 0.012024 }, { "ce_loss": 2.28463, "epoch": 0.0005, "grad_norm": 0.0066840858198702335, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.083984, "key_mse_loss_layer_024": 0.066406, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.072266, "key_mse_loss_layer_028": 0.07959, "key_mse_loss_layer_029": 0.075684, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.069672, "kv_vq_loss": 0.00122, "learning_rate": 0.0006747425010840046, "loss": 0.070825, "step": 500, "value_mse_loss_layer_000": 0.001396, "value_mse_loss_layer_001": 0.00386, "value_mse_loss_layer_002": 0.015503, "value_mse_loss_layer_003": 0.024048, "value_mse_loss_layer_004": 0.022217, "value_mse_loss_layer_005": 0.021973, "value_mse_loss_layer_006": 0.025269, "value_mse_loss_layer_007": 0.028687, "value_mse_loss_layer_008": 0.032471, "value_mse_loss_layer_009": 0.043213, "value_mse_loss_layer_010": 0.037598, "value_mse_loss_layer_011": 0.041016, "value_mse_loss_layer_012": 0.043213, "value_mse_loss_layer_013": 0.043457, "value_mse_loss_layer_014": 0.045654, "value_mse_loss_layer_015": 0.046631, "value_mse_loss_layer_016": 0.043457, "value_mse_loss_layer_017": 0.045898, "value_mse_loss_layer_018": 0.045654, "value_mse_loss_layer_019": 0.053223, "value_mse_loss_layer_020": 0.055176, "value_mse_loss_layer_021": 0.070801, "value_mse_loss_layer_022": 0.060303, "value_mse_loss_layer_023": 0.076172, "value_mse_loss_layer_024": 0.076172, "value_mse_loss_layer_025": 0.107422, "value_mse_loss_layer_026": 0.081543, "value_mse_loss_layer_027": 0.100098, "value_mse_loss_layer_028": 0.097656, "value_mse_loss_layer_029": 0.148438, "value_mse_loss_layer_030": 0.131836, "value_mse_loss_layer_031": 0.1875, "vq_loss_layer_000": 2.1e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 2.5e-05, "vq_loss_layer_003": 5.2e-05, "vq_loss_layer_004": 0.000145, "vq_loss_layer_005": 0.000185, "vq_loss_layer_006": 0.000284, "vq_loss_layer_007": 0.000418, "vq_loss_layer_008": 0.000416, "vq_loss_layer_009": 0.000557, "vq_loss_layer_010": 0.000484, "vq_loss_layer_011": 0.000526, "vq_loss_layer_012": 0.000938, "vq_loss_layer_013": 0.000713, "vq_loss_layer_014": 0.00082, "vq_loss_layer_015": 0.000862, "vq_loss_layer_016": 0.000984, "vq_loss_layer_017": 0.000801, "vq_loss_layer_018": 0.000484, "vq_loss_layer_019": 0.000374, "vq_loss_layer_020": 0.000454, "vq_loss_layer_021": 0.001091, "vq_loss_layer_022": 0.000553, "vq_loss_layer_023": 0.000843, "vq_loss_layer_024": 0.000675, "vq_loss_layer_025": 0.000919, "vq_loss_layer_026": 0.001366, "vq_loss_layer_027": 0.001228, "vq_loss_layer_028": 0.001358, "vq_loss_layer_029": 0.002457, "vq_loss_layer_030": 0.004211, "vq_loss_layer_031": 0.012268 }, { "ce_loss": 2.306118, "epoch": 0.00051, "grad_norm": 0.009196331724524498, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.070288, "kv_vq_loss": 0.001225, "learning_rate": 0.000676892544024484, "loss": 0.07146, "step": 510, "value_mse_loss_layer_000": 0.001381, "value_mse_loss_layer_001": 0.003876, "value_mse_loss_layer_002": 0.015076, "value_mse_loss_layer_003": 0.024048, "value_mse_loss_layer_004": 0.023315, "value_mse_loss_layer_005": 0.022461, "value_mse_loss_layer_006": 0.0271, "value_mse_loss_layer_007": 0.028564, "value_mse_loss_layer_008": 0.032959, "value_mse_loss_layer_009": 0.044922, "value_mse_loss_layer_010": 0.047119, "value_mse_loss_layer_011": 0.04541, "value_mse_loss_layer_012": 0.042969, "value_mse_loss_layer_013": 0.048584, "value_mse_loss_layer_014": 0.047607, "value_mse_loss_layer_015": 0.047852, "value_mse_loss_layer_016": 0.045166, "value_mse_loss_layer_017": 0.046387, "value_mse_loss_layer_018": 0.048584, "value_mse_loss_layer_019": 0.057129, "value_mse_loss_layer_020": 0.05835, "value_mse_loss_layer_021": 0.071289, "value_mse_loss_layer_022": 0.061768, "value_mse_loss_layer_023": 0.079102, "value_mse_loss_layer_024": 0.083008, "value_mse_loss_layer_025": 0.109375, "value_mse_loss_layer_026": 0.083496, "value_mse_loss_layer_027": 0.10791, "value_mse_loss_layer_028": 0.108398, "value_mse_loss_layer_029": 0.200195, "value_mse_loss_layer_030": 0.145508, "value_mse_loss_layer_031": 0.19043, "vq_loss_layer_000": 2.1e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 4.4e-05, "vq_loss_layer_004": 0.000188, "vq_loss_layer_005": 0.000191, "vq_loss_layer_006": 0.000353, "vq_loss_layer_007": 0.000412, "vq_loss_layer_008": 0.000399, "vq_loss_layer_009": 0.000668, "vq_loss_layer_010": 0.000595, "vq_loss_layer_011": 0.000782, "vq_loss_layer_012": 0.000908, "vq_loss_layer_013": 0.000957, "vq_loss_layer_014": 0.000877, "vq_loss_layer_015": 0.000931, "vq_loss_layer_016": 0.000969, "vq_loss_layer_017": 0.000717, "vq_loss_layer_018": 0.0005, "vq_loss_layer_019": 0.000393, "vq_loss_layer_020": 0.000458, "vq_loss_layer_021": 0.000885, "vq_loss_layer_022": 0.000467, "vq_loss_layer_023": 0.000687, "vq_loss_layer_024": 0.000553, "vq_loss_layer_025": 0.000721, "vq_loss_layer_026": 0.000942, "vq_loss_layer_027": 0.001022, "vq_loss_layer_028": 0.001457, "vq_loss_layer_029": 0.00293, "vq_loss_layer_030": 0.00383, "vq_loss_layer_031": 0.01001 }, { "ce_loss": 2.296355, "epoch": 0.00052, "grad_norm": 0.0069217318668961525, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.069708, "kv_vq_loss": 0.001227, "learning_rate": 0.0006790008359086997, "loss": 0.070898, "step": 520, "value_mse_loss_layer_000": 0.001381, "value_mse_loss_layer_001": 0.003876, "value_mse_loss_layer_002": 0.015015, "value_mse_loss_layer_003": 0.024414, "value_mse_loss_layer_004": 0.022705, "value_mse_loss_layer_005": 0.021729, "value_mse_loss_layer_006": 0.025146, "value_mse_loss_layer_007": 0.027954, "value_mse_loss_layer_008": 0.032471, "value_mse_loss_layer_009": 0.043701, "value_mse_loss_layer_010": 0.037354, "value_mse_loss_layer_011": 0.04126, "value_mse_loss_layer_012": 0.043945, "value_mse_loss_layer_013": 0.04248, "value_mse_loss_layer_014": 0.045898, "value_mse_loss_layer_015": 0.045654, "value_mse_loss_layer_016": 0.04248, "value_mse_loss_layer_017": 0.044189, "value_mse_loss_layer_018": 0.047852, "value_mse_loss_layer_019": 0.053955, "value_mse_loss_layer_020": 0.055664, "value_mse_loss_layer_021": 0.101074, "value_mse_loss_layer_022": 0.064941, "value_mse_loss_layer_023": 0.077637, "value_mse_loss_layer_024": 0.086914, "value_mse_loss_layer_025": 0.106445, "value_mse_loss_layer_026": 0.083984, "value_mse_loss_layer_027": 0.108887, "value_mse_loss_layer_028": 0.106445, "value_mse_loss_layer_029": 0.172852, "value_mse_loss_layer_030": 0.139648, "value_mse_loss_layer_031": 0.19043, "vq_loss_layer_000": 2e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 6.3e-05, "vq_loss_layer_004": 0.000167, "vq_loss_layer_005": 0.000175, "vq_loss_layer_006": 0.000277, "vq_loss_layer_007": 0.000397, "vq_loss_layer_008": 0.000406, "vq_loss_layer_009": 0.000607, "vq_loss_layer_010": 0.000475, "vq_loss_layer_011": 0.000553, "vq_loss_layer_012": 0.00103, "vq_loss_layer_013": 0.000839, "vq_loss_layer_014": 0.000866, "vq_loss_layer_015": 0.00087, "vq_loss_layer_016": 0.000942, "vq_loss_layer_017": 0.000698, "vq_loss_layer_018": 0.000492, "vq_loss_layer_019": 0.000374, "vq_loss_layer_020": 0.000467, "vq_loss_layer_021": 0.00177, "vq_loss_layer_022": 0.000572, "vq_loss_layer_023": 0.000767, "vq_loss_layer_024": 0.000717, "vq_loss_layer_025": 0.000809, "vq_loss_layer_026": 0.001167, "vq_loss_layer_027": 0.001297, "vq_loss_layer_028": 0.001595, "vq_loss_layer_029": 0.003387, "vq_loss_layer_030": 0.004272, "vq_loss_layer_031": 0.011292 }, { "ce_loss": 2.314534, "epoch": 0.00053, "grad_norm": 0.005528532434254885, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.069476, "kv_vq_loss": 0.001206, "learning_rate": 0.0006810689674001972, "loss": 0.070624, "step": 530, "value_mse_loss_layer_000": 0.001396, "value_mse_loss_layer_001": 0.003891, "value_mse_loss_layer_002": 0.014954, "value_mse_loss_layer_003": 0.025024, "value_mse_loss_layer_004": 0.022705, "value_mse_loss_layer_005": 0.022949, "value_mse_loss_layer_006": 0.025269, "value_mse_loss_layer_007": 0.028931, "value_mse_loss_layer_008": 0.033691, "value_mse_loss_layer_009": 0.041992, "value_mse_loss_layer_010": 0.038574, "value_mse_loss_layer_011": 0.040527, "value_mse_loss_layer_012": 0.043213, "value_mse_loss_layer_013": 0.043213, "value_mse_loss_layer_014": 0.046875, "value_mse_loss_layer_015": 0.046387, "value_mse_loss_layer_016": 0.047607, "value_mse_loss_layer_017": 0.04541, "value_mse_loss_layer_018": 0.047119, "value_mse_loss_layer_019": 0.059326, "value_mse_loss_layer_020": 0.058594, "value_mse_loss_layer_021": 0.082031, "value_mse_loss_layer_022": 0.061768, "value_mse_loss_layer_023": 0.079102, "value_mse_loss_layer_024": 0.088867, "value_mse_loss_layer_025": 0.104492, "value_mse_loss_layer_026": 0.086914, "value_mse_loss_layer_027": 0.118164, "value_mse_loss_layer_028": 0.109863, "value_mse_loss_layer_029": 0.171875, "value_mse_loss_layer_030": 0.151367, "value_mse_loss_layer_031": 0.201172, "vq_loss_layer_000": 2.2e-05, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 6.6e-05, "vq_loss_layer_004": 0.000136, "vq_loss_layer_005": 0.000194, "vq_loss_layer_006": 0.000278, "vq_loss_layer_007": 0.000429, "vq_loss_layer_008": 0.000471, "vq_loss_layer_009": 0.000534, "vq_loss_layer_010": 0.000553, "vq_loss_layer_011": 0.000549, "vq_loss_layer_012": 0.000973, "vq_loss_layer_013": 0.000736, "vq_loss_layer_014": 0.000889, "vq_loss_layer_015": 0.0009, "vq_loss_layer_016": 0.001228, "vq_loss_layer_017": 0.000877, "vq_loss_layer_018": 0.000463, "vq_loss_layer_019": 0.000441, "vq_loss_layer_020": 0.000427, "vq_loss_layer_021": 0.001228, "vq_loss_layer_022": 0.000507, "vq_loss_layer_023": 0.00066, "vq_loss_layer_024": 0.000774, "vq_loss_layer_025": 0.000721, "vq_loss_layer_026": 0.001083, "vq_loss_layer_027": 0.001389, "vq_loss_layer_028": 0.001617, "vq_loss_layer_029": 0.003159, "vq_loss_layer_030": 0.004578, "vq_loss_layer_031": 0.012451 }, { "ce_loss": 2.311853, "epoch": 0.00054, "grad_norm": 0.009179170243442059, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.070123, "kv_vq_loss": 0.001241, "learning_rate": 0.000683098439955742, "loss": 0.071326, "step": 540, "value_mse_loss_layer_000": 0.001312, "value_mse_loss_layer_001": 0.003738, "value_mse_loss_layer_002": 0.016724, "value_mse_loss_layer_003": 0.024414, "value_mse_loss_layer_004": 0.022217, "value_mse_loss_layer_005": 0.022095, "value_mse_loss_layer_006": 0.02478, "value_mse_loss_layer_007": 0.028809, "value_mse_loss_layer_008": 0.031982, "value_mse_loss_layer_009": 0.043213, "value_mse_loss_layer_010": 0.036865, "value_mse_loss_layer_011": 0.040039, "value_mse_loss_layer_012": 0.040771, "value_mse_loss_layer_013": 0.04126, "value_mse_loss_layer_014": 0.046875, "value_mse_loss_layer_015": 0.04248, "value_mse_loss_layer_016": 0.04126, "value_mse_loss_layer_017": 0.041992, "value_mse_loss_layer_018": 0.049805, "value_mse_loss_layer_019": 0.052979, "value_mse_loss_layer_020": 0.053223, "value_mse_loss_layer_021": 0.06543, "value_mse_loss_layer_022": 0.056885, "value_mse_loss_layer_023": 0.072266, "value_mse_loss_layer_024": 0.081543, "value_mse_loss_layer_025": 0.095703, "value_mse_loss_layer_026": 0.082031, "value_mse_loss_layer_027": 0.123535, "value_mse_loss_layer_028": 0.112793, "value_mse_loss_layer_029": 0.179688, "value_mse_loss_layer_030": 0.140625, "value_mse_loss_layer_031": 0.194336, "vq_loss_layer_000": 2.1e-05, "vq_loss_layer_001": 2.4e-05, "vq_loss_layer_002": 3e-05, "vq_loss_layer_003": 7.3e-05, "vq_loss_layer_004": 0.000137, "vq_loss_layer_005": 0.000175, "vq_loss_layer_006": 0.000277, "vq_loss_layer_007": 0.000463, "vq_loss_layer_008": 0.000456, "vq_loss_layer_009": 0.000652, "vq_loss_layer_010": 0.000534, "vq_loss_layer_011": 0.000603, "vq_loss_layer_012": 0.000896, "vq_loss_layer_013": 0.000637, "vq_loss_layer_014": 0.001022, "vq_loss_layer_015": 0.000835, "vq_loss_layer_016": 0.000999, "vq_loss_layer_017": 0.000736, "vq_loss_layer_018": 0.000587, "vq_loss_layer_019": 0.000404, "vq_loss_layer_020": 0.000441, "vq_loss_layer_021": 0.000973, "vq_loss_layer_022": 0.000492, "vq_loss_layer_023": 0.00069, "vq_loss_layer_024": 0.000732, "vq_loss_layer_025": 0.000786, "vq_loss_layer_026": 0.001297, "vq_loss_layer_027": 0.002243, "vq_loss_layer_028": 0.002396, "vq_loss_layer_029": 0.003464, "vq_loss_layer_030": 0.004242, "vq_loss_layer_031": 0.01355 }, { "ce_loss": 2.227315, "epoch": 0.00055, "grad_norm": 0.00563665060326457, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.042236, "key_mse_loss_layer_005": 0.055908, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.070477, "kv_vq_loss": 0.001267, "learning_rate": 0.0006850906723735609, "loss": 0.07171, "step": 550, "value_mse_loss_layer_000": 0.001335, "value_mse_loss_layer_001": 0.003769, "value_mse_loss_layer_002": 0.014709, "value_mse_loss_layer_003": 0.024292, "value_mse_loss_layer_004": 0.023804, "value_mse_loss_layer_005": 0.022461, "value_mse_loss_layer_006": 0.024658, "value_mse_loss_layer_007": 0.027588, "value_mse_loss_layer_008": 0.033447, "value_mse_loss_layer_009": 0.040283, "value_mse_loss_layer_010": 0.036621, "value_mse_loss_layer_011": 0.040283, "value_mse_loss_layer_012": 0.060303, "value_mse_loss_layer_013": 0.041504, "value_mse_loss_layer_014": 0.04541, "value_mse_loss_layer_015": 0.043457, "value_mse_loss_layer_016": 0.040527, "value_mse_loss_layer_017": 0.040039, "value_mse_loss_layer_018": 0.047363, "value_mse_loss_layer_019": 0.05127, "value_mse_loss_layer_020": 0.05249, "value_mse_loss_layer_021": 0.064941, "value_mse_loss_layer_022": 0.059814, "value_mse_loss_layer_023": 0.070801, "value_mse_loss_layer_024": 0.091309, "value_mse_loss_layer_025": 0.091309, "value_mse_loss_layer_026": 0.080078, "value_mse_loss_layer_027": 0.10791, "value_mse_loss_layer_028": 0.099609, "value_mse_loss_layer_029": 0.154297, "value_mse_loss_layer_030": 0.141602, "value_mse_loss_layer_031": 0.195312, "vq_loss_layer_000": 2.1e-05, "vq_loss_layer_001": 2.9e-05, "vq_loss_layer_002": 2.6e-05, "vq_loss_layer_003": 6.2e-05, "vq_loss_layer_004": 0.000176, "vq_loss_layer_005": 0.000211, "vq_loss_layer_006": 0.000267, "vq_loss_layer_007": 0.000389, "vq_loss_layer_008": 0.000591, "vq_loss_layer_009": 0.000557, "vq_loss_layer_010": 0.000599, "vq_loss_layer_011": 0.000618, "vq_loss_layer_012": 0.002609, "vq_loss_layer_013": 0.000755, "vq_loss_layer_014": 0.001007, "vq_loss_layer_015": 0.000999, "vq_loss_layer_016": 0.00106, "vq_loss_layer_017": 0.000706, "vq_loss_layer_018": 0.000523, "vq_loss_layer_019": 0.000477, "vq_loss_layer_020": 0.000473, "vq_loss_layer_021": 0.000984, "vq_loss_layer_022": 0.000614, "vq_loss_layer_023": 0.000706, "vq_loss_layer_024": 0.000866, "vq_loss_layer_025": 0.00095, "vq_loss_layer_026": 0.001358, "vq_loss_layer_027": 0.001541, "vq_loss_layer_028": 0.001862, "vq_loss_layer_029": 0.003403, "vq_loss_layer_030": 0.005707, "vq_loss_layer_031": 0.014709 }, { "ce_loss": 2.286748, "epoch": 0.00056, "grad_norm": 0.0062910341657698154, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083496, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.107422, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.09375, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.068994, "kv_vq_loss": 0.001184, "learning_rate": 0.0006870470067515499, "loss": 0.070148, "step": 560, "value_mse_loss_layer_000": 0.001305, "value_mse_loss_layer_001": 0.003738, "value_mse_loss_layer_002": 0.014709, "value_mse_loss_layer_003": 0.024414, "value_mse_loss_layer_004": 0.023315, "value_mse_loss_layer_005": 0.022827, "value_mse_loss_layer_006": 0.025024, "value_mse_loss_layer_007": 0.028687, "value_mse_loss_layer_008": 0.032959, "value_mse_loss_layer_009": 0.042725, "value_mse_loss_layer_010": 0.036865, "value_mse_loss_layer_011": 0.038818, "value_mse_loss_layer_012": 0.04248, "value_mse_loss_layer_013": 0.040283, "value_mse_loss_layer_014": 0.044922, "value_mse_loss_layer_015": 0.041504, "value_mse_loss_layer_016": 0.041504, "value_mse_loss_layer_017": 0.041992, "value_mse_loss_layer_018": 0.049316, "value_mse_loss_layer_019": 0.05542, "value_mse_loss_layer_020": 0.058838, "value_mse_loss_layer_021": 0.068848, "value_mse_loss_layer_022": 0.062012, "value_mse_loss_layer_023": 0.089844, "value_mse_loss_layer_024": 0.098145, "value_mse_loss_layer_025": 0.120605, "value_mse_loss_layer_026": 0.100586, "value_mse_loss_layer_027": 0.12793, "value_mse_loss_layer_028": 0.117676, "value_mse_loss_layer_029": 0.175781, "value_mse_loss_layer_030": 0.175781, "value_mse_loss_layer_031": 0.213867, "vq_loss_layer_000": 2.3e-05, "vq_loss_layer_001": 3.7e-05, "vq_loss_layer_002": 3.2e-05, "vq_loss_layer_003": 7.3e-05, "vq_loss_layer_004": 0.000147, "vq_loss_layer_005": 0.000179, "vq_loss_layer_006": 0.000256, "vq_loss_layer_007": 0.000368, "vq_loss_layer_008": 0.000471, "vq_loss_layer_009": 0.000618, "vq_loss_layer_010": 0.000519, "vq_loss_layer_011": 0.000515, "vq_loss_layer_012": 0.000957, "vq_loss_layer_013": 0.000664, "vq_loss_layer_014": 0.000858, "vq_loss_layer_015": 0.000774, "vq_loss_layer_016": 0.000992, "vq_loss_layer_017": 0.000626, "vq_loss_layer_018": 0.000668, "vq_loss_layer_019": 0.000456, "vq_loss_layer_020": 0.000366, "vq_loss_layer_021": 0.00071, "vq_loss_layer_022": 0.000462, "vq_loss_layer_023": 0.000626, "vq_loss_layer_024": 0.00069, "vq_loss_layer_025": 0.0009, "vq_loss_layer_026": 0.001205, "vq_loss_layer_027": 0.001167, "vq_loss_layer_028": 0.001831, "vq_loss_layer_029": 0.003677, "vq_loss_layer_030": 0.005157, "vq_loss_layer_031": 0.016113 }, { "ce_loss": 2.304357, "epoch": 0.00057, "grad_norm": 0.007892019115388393, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.068817, "kv_vq_loss": 0.001165, "learning_rate": 0.0006889687139181227, "loss": 0.069934, "step": 570, "value_mse_loss_layer_000": 0.001366, "value_mse_loss_layer_001": 0.003708, "value_mse_loss_layer_002": 0.014343, "value_mse_loss_layer_003": 0.023438, "value_mse_loss_layer_004": 0.020264, "value_mse_loss_layer_005": 0.020508, "value_mse_loss_layer_006": 0.024536, "value_mse_loss_layer_007": 0.027222, "value_mse_loss_layer_008": 0.031738, "value_mse_loss_layer_009": 0.04248, "value_mse_loss_layer_010": 0.036621, "value_mse_loss_layer_011": 0.040283, "value_mse_loss_layer_012": 0.041992, "value_mse_loss_layer_013": 0.041016, "value_mse_loss_layer_014": 0.048096, "value_mse_loss_layer_015": 0.04541, "value_mse_loss_layer_016": 0.041748, "value_mse_loss_layer_017": 0.044678, "value_mse_loss_layer_018": 0.045166, "value_mse_loss_layer_019": 0.052246, "value_mse_loss_layer_020": 0.055176, "value_mse_loss_layer_021": 0.068848, "value_mse_loss_layer_022": 0.059082, "value_mse_loss_layer_023": 0.076172, "value_mse_loss_layer_024": 0.077148, "value_mse_loss_layer_025": 0.098633, "value_mse_loss_layer_026": 0.08252, "value_mse_loss_layer_027": 0.109863, "value_mse_loss_layer_028": 0.104492, "value_mse_loss_layer_029": 0.175781, "value_mse_loss_layer_030": 0.137695, "value_mse_loss_layer_031": 0.181641, "vq_loss_layer_000": 1.9e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 5.2e-05, "vq_loss_layer_004": 0.000124, "vq_loss_layer_005": 0.000153, "vq_loss_layer_006": 0.000269, "vq_loss_layer_007": 0.00038, "vq_loss_layer_008": 0.000378, "vq_loss_layer_009": 0.000629, "vq_loss_layer_010": 0.000454, "vq_loss_layer_011": 0.000546, "vq_loss_layer_012": 0.000919, "vq_loss_layer_013": 0.000652, "vq_loss_layer_014": 0.001015, "vq_loss_layer_015": 0.000877, "vq_loss_layer_016": 0.000824, "vq_loss_layer_017": 0.000744, "vq_loss_layer_018": 0.000458, "vq_loss_layer_019": 0.000362, "vq_loss_layer_020": 0.000427, "vq_loss_layer_021": 0.000923, "vq_loss_layer_022": 0.000418, "vq_loss_layer_023": 0.000706, "vq_loss_layer_024": 0.000443, "vq_loss_layer_025": 0.000679, "vq_loss_layer_026": 0.001099, "vq_loss_layer_027": 0.001221, "vq_loss_layer_028": 0.001457, "vq_loss_layer_029": 0.002502, "vq_loss_layer_030": 0.003418, "vq_loss_layer_031": 0.009888 }, { "ce_loss": 2.348252, "epoch": 0.00058, "grad_norm": 0.006522058043628931, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.069043, "kv_vq_loss": 0.001195, "learning_rate": 0.0006908569983907342, "loss": 0.070184, "step": 580, "value_mse_loss_layer_000": 0.001389, "value_mse_loss_layer_001": 0.003738, "value_mse_loss_layer_002": 0.014465, "value_mse_loss_layer_003": 0.023682, "value_mse_loss_layer_004": 0.020752, "value_mse_loss_layer_005": 0.022583, "value_mse_loss_layer_006": 0.024292, "value_mse_loss_layer_007": 0.027466, "value_mse_loss_layer_008": 0.031128, "value_mse_loss_layer_009": 0.040527, "value_mse_loss_layer_010": 0.035889, "value_mse_loss_layer_011": 0.038574, "value_mse_loss_layer_012": 0.039551, "value_mse_loss_layer_013": 0.04126, "value_mse_loss_layer_014": 0.043457, "value_mse_loss_layer_015": 0.043701, "value_mse_loss_layer_016": 0.044189, "value_mse_loss_layer_017": 0.043213, "value_mse_loss_layer_018": 0.043213, "value_mse_loss_layer_019": 0.050293, "value_mse_loss_layer_020": 0.054688, "value_mse_loss_layer_021": 0.070312, "value_mse_loss_layer_022": 0.060303, "value_mse_loss_layer_023": 0.07373, "value_mse_loss_layer_024": 0.078125, "value_mse_loss_layer_025": 0.106934, "value_mse_loss_layer_026": 0.083008, "value_mse_loss_layer_027": 0.11084, "value_mse_loss_layer_028": 0.101562, "value_mse_loss_layer_029": 0.15332, "value_mse_loss_layer_030": 0.133789, "value_mse_loss_layer_031": 0.182617, "vq_loss_layer_000": 2e-05, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 5.4e-05, "vq_loss_layer_004": 0.000128, "vq_loss_layer_005": 0.000202, "vq_loss_layer_006": 0.000271, "vq_loss_layer_007": 0.000389, "vq_loss_layer_008": 0.000364, "vq_loss_layer_009": 0.000504, "vq_loss_layer_010": 0.000433, "vq_loss_layer_011": 0.000469, "vq_loss_layer_012": 0.000813, "vq_loss_layer_013": 0.000702, "vq_loss_layer_014": 0.000809, "vq_loss_layer_015": 0.00079, "vq_loss_layer_016": 0.001038, "vq_loss_layer_017": 0.000656, "vq_loss_layer_018": 0.000383, "vq_loss_layer_019": 0.000326, "vq_loss_layer_020": 0.000429, "vq_loss_layer_021": 0.001015, "vq_loss_layer_022": 0.000475, "vq_loss_layer_023": 0.000648, "vq_loss_layer_024": 0.000515, "vq_loss_layer_025": 0.000732, "vq_loss_layer_026": 0.001129, "vq_loss_layer_027": 0.001251, "vq_loss_layer_028": 0.001419, "vq_loss_layer_029": 0.002609, "vq_loss_layer_030": 0.003571, "vq_loss_layer_031": 0.010559 }, { "ce_loss": 2.26809, "epoch": 0.00059, "grad_norm": 0.00536474771797657, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.069238, "kv_vq_loss": 0.001199, "learning_rate": 0.0006927130029105359, "loss": 0.070416, "step": 590, "value_mse_loss_layer_000": 0.001373, "value_mse_loss_layer_001": 0.003738, "value_mse_loss_layer_002": 0.015076, "value_mse_loss_layer_003": 0.024536, "value_mse_loss_layer_004": 0.022217, "value_mse_loss_layer_005": 0.022095, "value_mse_loss_layer_006": 0.02478, "value_mse_loss_layer_007": 0.02771, "value_mse_loss_layer_008": 0.031982, "value_mse_loss_layer_009": 0.040283, "value_mse_loss_layer_010": 0.037598, "value_mse_loss_layer_011": 0.041016, "value_mse_loss_layer_012": 0.040283, "value_mse_loss_layer_013": 0.041016, "value_mse_loss_layer_014": 0.044678, "value_mse_loss_layer_015": 0.044189, "value_mse_loss_layer_016": 0.04126, "value_mse_loss_layer_017": 0.045898, "value_mse_loss_layer_018": 0.045166, "value_mse_loss_layer_019": 0.05835, "value_mse_loss_layer_020": 0.058105, "value_mse_loss_layer_021": 0.068848, "value_mse_loss_layer_022": 0.060059, "value_mse_loss_layer_023": 0.072266, "value_mse_loss_layer_024": 0.078125, "value_mse_loss_layer_025": 0.101074, "value_mse_loss_layer_026": 0.083984, "value_mse_loss_layer_027": 0.109863, "value_mse_loss_layer_028": 0.10791, "value_mse_loss_layer_029": 0.155273, "value_mse_loss_layer_030": 0.148438, "value_mse_loss_layer_031": 0.196289, "vq_loss_layer_000": 2.1e-05, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.3e-05, "vq_loss_layer_003": 6.2e-05, "vq_loss_layer_004": 0.000168, "vq_loss_layer_005": 0.000184, "vq_loss_layer_006": 0.000278, "vq_loss_layer_007": 0.000414, "vq_loss_layer_008": 0.000441, "vq_loss_layer_009": 0.000519, "vq_loss_layer_010": 0.00053, "vq_loss_layer_011": 0.000591, "vq_loss_layer_012": 0.000824, "vq_loss_layer_013": 0.000671, "vq_loss_layer_014": 0.000854, "vq_loss_layer_015": 0.0009, "vq_loss_layer_016": 0.00087, "vq_loss_layer_017": 0.000851, "vq_loss_layer_018": 0.000504, "vq_loss_layer_019": 0.000454, "vq_loss_layer_020": 0.00046, "vq_loss_layer_021": 0.000931, "vq_loss_layer_022": 0.000507, "vq_loss_layer_023": 0.000622, "vq_loss_layer_024": 0.000538, "vq_loss_layer_025": 0.000809, "vq_loss_layer_026": 0.001144, "vq_loss_layer_027": 0.001259, "vq_loss_layer_028": 0.001785, "vq_loss_layer_029": 0.003113, "vq_loss_layer_030": 0.004425, "vq_loss_layer_031": 0.012695 }, { "ce_loss": 2.291785, "epoch": 0.0006, "grad_norm": 0.007603843696415424, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.053711, "key_mse_loss_layer_004": 0.062988, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.06889, "kv_vq_loss": 0.001162, "learning_rate": 0.0006945378125959108, "loss": 0.070032, "step": 600, "value_mse_loss_layer_000": 0.001366, "value_mse_loss_layer_001": 0.003677, "value_mse_loss_layer_002": 0.014465, "value_mse_loss_layer_003": 0.023682, "value_mse_loss_layer_004": 0.020508, "value_mse_loss_layer_005": 0.020264, "value_mse_loss_layer_006": 0.023438, "value_mse_loss_layer_007": 0.026611, "value_mse_loss_layer_008": 0.030762, "value_mse_loss_layer_009": 0.040039, "value_mse_loss_layer_010": 0.036133, "value_mse_loss_layer_011": 0.039307, "value_mse_loss_layer_012": 0.045898, "value_mse_loss_layer_013": 0.041748, "value_mse_loss_layer_014": 0.044434, "value_mse_loss_layer_015": 0.043945, "value_mse_loss_layer_016": 0.041016, "value_mse_loss_layer_017": 0.043945, "value_mse_loss_layer_018": 0.043945, "value_mse_loss_layer_019": 0.052002, "value_mse_loss_layer_020": 0.054688, "value_mse_loss_layer_021": 0.069336, "value_mse_loss_layer_022": 0.0625, "value_mse_loss_layer_023": 0.082031, "value_mse_loss_layer_024": 0.077148, "value_mse_loss_layer_025": 0.102051, "value_mse_loss_layer_026": 0.07666, "value_mse_loss_layer_027": 0.10791, "value_mse_loss_layer_028": 0.09668, "value_mse_loss_layer_029": 0.185547, "value_mse_loss_layer_030": 0.131836, "value_mse_loss_layer_031": 0.173828, "vq_loss_layer_000": 1.9e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 4.4e-05, "vq_loss_layer_004": 0.000132, "vq_loss_layer_005": 0.000152, "vq_loss_layer_006": 0.000246, "vq_loss_layer_007": 0.000402, "vq_loss_layer_008": 0.000366, "vq_loss_layer_009": 0.000492, "vq_loss_layer_010": 0.000439, "vq_loss_layer_011": 0.000511, "vq_loss_layer_012": 0.001244, "vq_loss_layer_013": 0.000755, "vq_loss_layer_014": 0.000813, "vq_loss_layer_015": 0.000935, "vq_loss_layer_016": 0.000866, "vq_loss_layer_017": 0.000946, "vq_loss_layer_018": 0.000435, "vq_loss_layer_019": 0.000395, "vq_loss_layer_020": 0.000462, "vq_loss_layer_021": 0.000904, "vq_loss_layer_022": 0.000538, "vq_loss_layer_023": 0.000786, "vq_loss_layer_024": 0.000526, "vq_loss_layer_025": 0.000706, "vq_loss_layer_026": 0.000992, "vq_loss_layer_027": 0.001289, "vq_loss_layer_028": 0.001244, "vq_loss_layer_029": 0.003296, "vq_loss_layer_030": 0.00386, "vq_loss_layer_031": 0.009583 }, { "ce_loss": 2.258265, "epoch": 0.00061, "grad_norm": 0.006390747148543596, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.069171, "kv_vq_loss": 0.001197, "learning_rate": 0.0006963324587526917, "loss": 0.070319, "step": 610, "value_mse_loss_layer_000": 0.00132, "value_mse_loss_layer_001": 0.003555, "value_mse_loss_layer_002": 0.014343, "value_mse_loss_layer_003": 0.02356, "value_mse_loss_layer_004": 0.021729, "value_mse_loss_layer_005": 0.021729, "value_mse_loss_layer_006": 0.0271, "value_mse_loss_layer_007": 0.028198, "value_mse_loss_layer_008": 0.03125, "value_mse_loss_layer_009": 0.041748, "value_mse_loss_layer_010": 0.038086, "value_mse_loss_layer_011": 0.038818, "value_mse_loss_layer_012": 0.040527, "value_mse_loss_layer_013": 0.041748, "value_mse_loss_layer_014": 0.044434, "value_mse_loss_layer_015": 0.043213, "value_mse_loss_layer_016": 0.040039, "value_mse_loss_layer_017": 0.044434, "value_mse_loss_layer_018": 0.045898, "value_mse_loss_layer_019": 0.052246, "value_mse_loss_layer_020": 0.054443, "value_mse_loss_layer_021": 0.066895, "value_mse_loss_layer_022": 0.055664, "value_mse_loss_layer_023": 0.068848, "value_mse_loss_layer_024": 0.076172, "value_mse_loss_layer_025": 0.099609, "value_mse_loss_layer_026": 0.08252, "value_mse_loss_layer_027": 0.100098, "value_mse_loss_layer_028": 0.095703, "value_mse_loss_layer_029": 0.150391, "value_mse_loss_layer_030": 0.134766, "value_mse_loss_layer_031": 0.183594, "vq_loss_layer_000": 1.9e-05, "vq_loss_layer_001": 2.8e-05, "vq_loss_layer_002": 3e-05, "vq_loss_layer_003": 7.4e-05, "vq_loss_layer_004": 0.000148, "vq_loss_layer_005": 0.000179, "vq_loss_layer_006": 0.000465, "vq_loss_layer_007": 0.000425, "vq_loss_layer_008": 0.000448, "vq_loss_layer_009": 0.000622, "vq_loss_layer_010": 0.00058, "vq_loss_layer_011": 0.00053, "vq_loss_layer_012": 0.00087, "vq_loss_layer_013": 0.000725, "vq_loss_layer_014": 0.000923, "vq_loss_layer_015": 0.000847, "vq_loss_layer_016": 0.000969, "vq_loss_layer_017": 0.000946, "vq_loss_layer_018": 0.000565, "vq_loss_layer_019": 0.00053, "vq_loss_layer_020": 0.000595, "vq_loss_layer_021": 0.00132, "vq_loss_layer_022": 0.000595, "vq_loss_layer_023": 0.000751, "vq_loss_layer_024": 0.000732, "vq_loss_layer_025": 0.001053, "vq_loss_layer_026": 0.001709, "vq_loss_layer_027": 0.001541, "vq_loss_layer_028": 0.00193, "vq_loss_layer_029": 0.003128, "vq_loss_layer_030": 0.004761, "vq_loss_layer_031": 0.014221 }, { "ce_loss": 2.344737, "epoch": 0.00062, "grad_norm": 0.005467478651553392, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.059326, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.113281, "key_mse_loss_layer_016": 0.108398, "key_mse_loss_layer_017": 0.106934, "key_mse_loss_layer_018": 0.115723, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.106445, "key_mse_loss_layer_021": 0.100098, "key_mse_loss_layer_022": 0.104492, "key_mse_loss_layer_023": 0.103027, "key_mse_loss_layer_024": 0.082031, "key_mse_loss_layer_025": 0.07666, "key_mse_loss_layer_026": 0.093262, "key_mse_loss_layer_027": 0.088867, "key_mse_loss_layer_028": 0.096191, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.068121, "kv_vq_loss": 0.001139, "learning_rate": 0.0006980979223745634, "loss": 0.06922, "step": 620, "value_mse_loss_layer_000": 0.001266, "value_mse_loss_layer_001": 0.003494, "value_mse_loss_layer_002": 0.014282, "value_mse_loss_layer_003": 0.023071, "value_mse_loss_layer_004": 0.022461, "value_mse_loss_layer_005": 0.021973, "value_mse_loss_layer_006": 0.024048, "value_mse_loss_layer_007": 0.027222, "value_mse_loss_layer_008": 0.030518, "value_mse_loss_layer_009": 0.03833, "value_mse_loss_layer_010": 0.035156, "value_mse_loss_layer_011": 0.037598, "value_mse_loss_layer_012": 0.038818, "value_mse_loss_layer_013": 0.04248, "value_mse_loss_layer_014": 0.04126, "value_mse_loss_layer_015": 0.039062, "value_mse_loss_layer_016": 0.037109, "value_mse_loss_layer_017": 0.040527, "value_mse_loss_layer_018": 0.042236, "value_mse_loss_layer_019": 0.046875, "value_mse_loss_layer_020": 0.049072, "value_mse_loss_layer_021": 0.071289, "value_mse_loss_layer_022": 0.051025, "value_mse_loss_layer_023": 0.0625, "value_mse_loss_layer_024": 0.070312, "value_mse_loss_layer_025": 0.089355, "value_mse_loss_layer_026": 0.071777, "value_mse_loss_layer_027": 0.096191, "value_mse_loss_layer_028": 0.088379, "value_mse_loss_layer_029": 0.133789, "value_mse_loss_layer_030": 0.125, "value_mse_loss_layer_031": 0.179688, "vq_loss_layer_000": 2e-05, "vq_loss_layer_001": 3.7e-05, "vq_loss_layer_002": 4.7e-05, "vq_loss_layer_003": 8.1e-05, "vq_loss_layer_004": 0.000189, "vq_loss_layer_005": 0.000217, "vq_loss_layer_006": 0.000299, "vq_loss_layer_007": 0.000418, "vq_loss_layer_008": 0.000534, "vq_loss_layer_009": 0.000542, "vq_loss_layer_010": 0.000622, "vq_loss_layer_011": 0.000599, "vq_loss_layer_012": 0.000916, "vq_loss_layer_013": 0.000904, "vq_loss_layer_014": 0.000916, "vq_loss_layer_015": 0.000744, "vq_loss_layer_016": 0.000881, "vq_loss_layer_017": 0.000706, "vq_loss_layer_018": 0.0005, "vq_loss_layer_019": 0.000385, "vq_loss_layer_020": 0.000448, "vq_loss_layer_021": 0.001465, "vq_loss_layer_022": 0.000504, "vq_loss_layer_023": 0.000702, "vq_loss_layer_024": 0.000702, "vq_loss_layer_025": 0.001083, "vq_loss_layer_026": 0.001282, "vq_loss_layer_027": 0.001404, "vq_loss_layer_028": 0.002167, "vq_loss_layer_029": 0.002869, "vq_loss_layer_030": 0.005005, "vq_loss_layer_031": 0.015137 }, { "ce_loss": 2.280637, "epoch": 0.00063, "grad_norm": 0.006097069475799799, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.068347, "kv_vq_loss": 0.001163, "learning_rate": 0.0006998351373633953, "loss": 0.069458, "step": 630, "value_mse_loss_layer_000": 0.001335, "value_mse_loss_layer_001": 0.003601, "value_mse_loss_layer_002": 0.016357, "value_mse_loss_layer_003": 0.022949, "value_mse_loss_layer_004": 0.020508, "value_mse_loss_layer_005": 0.020752, "value_mse_loss_layer_006": 0.023926, "value_mse_loss_layer_007": 0.027466, "value_mse_loss_layer_008": 0.031494, "value_mse_loss_layer_009": 0.04126, "value_mse_loss_layer_010": 0.038086, "value_mse_loss_layer_011": 0.039307, "value_mse_loss_layer_012": 0.038818, "value_mse_loss_layer_013": 0.041016, "value_mse_loss_layer_014": 0.044434, "value_mse_loss_layer_015": 0.043457, "value_mse_loss_layer_016": 0.039307, "value_mse_loss_layer_017": 0.043457, "value_mse_loss_layer_018": 0.041992, "value_mse_loss_layer_019": 0.051758, "value_mse_loss_layer_020": 0.053467, "value_mse_loss_layer_021": 0.064453, "value_mse_loss_layer_022": 0.062012, "value_mse_loss_layer_023": 0.070312, "value_mse_loss_layer_024": 0.071289, "value_mse_loss_layer_025": 0.101074, "value_mse_loss_layer_026": 0.076172, "value_mse_loss_layer_027": 0.099609, "value_mse_loss_layer_028": 0.094727, "value_mse_loss_layer_029": 0.144531, "value_mse_loss_layer_030": 0.125, "value_mse_loss_layer_031": 0.180664, "vq_loss_layer_000": 2e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 4.9e-05, "vq_loss_layer_004": 0.000137, "vq_loss_layer_005": 0.000164, "vq_loss_layer_006": 0.000265, "vq_loss_layer_007": 0.000389, "vq_loss_layer_008": 0.000422, "vq_loss_layer_009": 0.000565, "vq_loss_layer_010": 0.000504, "vq_loss_layer_011": 0.000523, "vq_loss_layer_012": 0.000805, "vq_loss_layer_013": 0.000698, "vq_loss_layer_014": 0.000877, "vq_loss_layer_015": 0.000797, "vq_loss_layer_016": 0.000813, "vq_loss_layer_017": 0.000706, "vq_loss_layer_018": 0.000404, "vq_loss_layer_019": 0.000334, "vq_loss_layer_020": 0.000427, "vq_loss_layer_021": 0.000965, "vq_loss_layer_022": 0.000603, "vq_loss_layer_023": 0.000683, "vq_loss_layer_024": 0.0005, "vq_loss_layer_025": 0.000767, "vq_loss_layer_026": 0.001076, "vq_loss_layer_027": 0.00106, "vq_loss_layer_028": 0.001389, "vq_loss_layer_029": 0.002457, "vq_loss_layer_030": 0.003464, "vq_loss_layer_031": 0.011719 }, { "ce_loss": 2.2478, "epoch": 0.00064, "grad_norm": 0.008626158349215984, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.068781, "kv_vq_loss": 0.001217, "learning_rate": 0.0007015449934959717, "loss": 0.069946, "step": 640, "value_mse_loss_layer_000": 0.001305, "value_mse_loss_layer_001": 0.003571, "value_mse_loss_layer_002": 0.014587, "value_mse_loss_layer_003": 0.022217, "value_mse_loss_layer_004": 0.02002, "value_mse_loss_layer_005": 0.020508, "value_mse_loss_layer_006": 0.02356, "value_mse_loss_layer_007": 0.026611, "value_mse_loss_layer_008": 0.030151, "value_mse_loss_layer_009": 0.039551, "value_mse_loss_layer_010": 0.035156, "value_mse_loss_layer_011": 0.038086, "value_mse_loss_layer_012": 0.042969, "value_mse_loss_layer_013": 0.040039, "value_mse_loss_layer_014": 0.041748, "value_mse_loss_layer_015": 0.04126, "value_mse_loss_layer_016": 0.042725, "value_mse_loss_layer_017": 0.041016, "value_mse_loss_layer_018": 0.041504, "value_mse_loss_layer_019": 0.050293, "value_mse_loss_layer_020": 0.052734, "value_mse_loss_layer_021": 0.0625, "value_mse_loss_layer_022": 0.056641, "value_mse_loss_layer_023": 0.066895, "value_mse_loss_layer_024": 0.077637, "value_mse_loss_layer_025": 0.09082, "value_mse_loss_layer_026": 0.074219, "value_mse_loss_layer_027": 0.104492, "value_mse_loss_layer_028": 0.10791, "value_mse_loss_layer_029": 0.167969, "value_mse_loss_layer_030": 0.123535, "value_mse_loss_layer_031": 0.175781, "vq_loss_layer_000": 1.9e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 5e-05, "vq_loss_layer_004": 0.00014, "vq_loss_layer_005": 0.000165, "vq_loss_layer_006": 0.00025, "vq_loss_layer_007": 0.000393, "vq_loss_layer_008": 0.000393, "vq_loss_layer_009": 0.000546, "vq_loss_layer_010": 0.000454, "vq_loss_layer_011": 0.000507, "vq_loss_layer_012": 0.001221, "vq_loss_layer_013": 0.000679, "vq_loss_layer_014": 0.000866, "vq_loss_layer_015": 0.000713, "vq_loss_layer_016": 0.001015, "vq_loss_layer_017": 0.00066, "vq_loss_layer_018": 0.000401, "vq_loss_layer_019": 0.000443, "vq_loss_layer_020": 0.000504, "vq_loss_layer_021": 0.00095, "vq_loss_layer_022": 0.0005, "vq_loss_layer_023": 0.00066, "vq_loss_layer_024": 0.000637, "vq_loss_layer_025": 0.000744, "vq_loss_layer_026": 0.001183, "vq_loss_layer_027": 0.001305, "vq_loss_layer_028": 0.001884, "vq_loss_layer_029": 0.002884, "vq_loss_layer_030": 0.003189, "vq_loss_layer_031": 0.010315 }, { "ce_loss": 2.274983, "epoch": 0.00065, "grad_norm": 0.0070764231495559216, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.068573, "kv_vq_loss": 0.001181, "learning_rate": 0.0007032283391607138, "loss": 0.069702, "step": 650, "value_mse_loss_layer_000": 0.001305, "value_mse_loss_layer_001": 0.003586, "value_mse_loss_layer_002": 0.014221, "value_mse_loss_layer_003": 0.023438, "value_mse_loss_layer_004": 0.02002, "value_mse_loss_layer_005": 0.019897, "value_mse_loss_layer_006": 0.02417, "value_mse_loss_layer_007": 0.02771, "value_mse_loss_layer_008": 0.030518, "value_mse_loss_layer_009": 0.040527, "value_mse_loss_layer_010": 0.034424, "value_mse_loss_layer_011": 0.03833, "value_mse_loss_layer_012": 0.039062, "value_mse_loss_layer_013": 0.04126, "value_mse_loss_layer_014": 0.044434, "value_mse_loss_layer_015": 0.046143, "value_mse_loss_layer_016": 0.041504, "value_mse_loss_layer_017": 0.04248, "value_mse_loss_layer_018": 0.054443, "value_mse_loss_layer_019": 0.054199, "value_mse_loss_layer_020": 0.061279, "value_mse_loss_layer_021": 0.074707, "value_mse_loss_layer_022": 0.059326, "value_mse_loss_layer_023": 0.084473, "value_mse_loss_layer_024": 0.083984, "value_mse_loss_layer_025": 0.106934, "value_mse_loss_layer_026": 0.081055, "value_mse_loss_layer_027": 0.111816, "value_mse_loss_layer_028": 0.105469, "value_mse_loss_layer_029": 0.15918, "value_mse_loss_layer_030": 0.137695, "value_mse_loss_layer_031": 0.177734, "vq_loss_layer_000": 1.9e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 0.000118, "vq_loss_layer_005": 0.000153, "vq_loss_layer_006": 0.00029, "vq_loss_layer_007": 0.000441, "vq_loss_layer_008": 0.000389, "vq_loss_layer_009": 0.000595, "vq_loss_layer_010": 0.000423, "vq_loss_layer_011": 0.000523, "vq_loss_layer_012": 0.000832, "vq_loss_layer_013": 0.000694, "vq_loss_layer_014": 0.000847, "vq_loss_layer_015": 0.000877, "vq_loss_layer_016": 0.000881, "vq_loss_layer_017": 0.000652, "vq_loss_layer_018": 0.000572, "vq_loss_layer_019": 0.000362, "vq_loss_layer_020": 0.000515, "vq_loss_layer_021": 0.001053, "vq_loss_layer_022": 0.000412, "vq_loss_layer_023": 0.000835, "vq_loss_layer_024": 0.000565, "vq_loss_layer_025": 0.000713, "vq_loss_layer_026": 0.000977, "vq_loss_layer_027": 0.001236, "vq_loss_layer_028": 0.001564, "vq_loss_layer_029": 0.002625, "vq_loss_layer_030": 0.004211, "vq_loss_layer_031": 0.009949 }, { "ce_loss": 2.244173, "epoch": 0.00066, "grad_norm": 0.00500284181907773, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.068274, "kv_vq_loss": 0.001181, "learning_rate": 0.0007048859838854671, "loss": 0.069415, "step": 660, "value_mse_loss_layer_000": 0.001335, "value_mse_loss_layer_001": 0.003586, "value_mse_loss_layer_002": 0.01416, "value_mse_loss_layer_003": 0.023804, "value_mse_loss_layer_004": 0.021484, "value_mse_loss_layer_005": 0.025269, "value_mse_loss_layer_006": 0.024048, "value_mse_loss_layer_007": 0.0271, "value_mse_loss_layer_008": 0.030884, "value_mse_loss_layer_009": 0.040771, "value_mse_loss_layer_010": 0.036865, "value_mse_loss_layer_011": 0.039062, "value_mse_loss_layer_012": 0.041016, "value_mse_loss_layer_013": 0.040283, "value_mse_loss_layer_014": 0.043457, "value_mse_loss_layer_015": 0.043945, "value_mse_loss_layer_016": 0.041504, "value_mse_loss_layer_017": 0.041504, "value_mse_loss_layer_018": 0.042969, "value_mse_loss_layer_019": 0.050049, "value_mse_loss_layer_020": 0.052979, "value_mse_loss_layer_021": 0.066406, "value_mse_loss_layer_022": 0.061035, "value_mse_loss_layer_023": 0.070312, "value_mse_loss_layer_024": 0.07666, "value_mse_loss_layer_025": 0.096191, "value_mse_loss_layer_026": 0.082031, "value_mse_loss_layer_027": 0.102051, "value_mse_loss_layer_028": 0.097656, "value_mse_loss_layer_029": 0.147461, "value_mse_loss_layer_030": 0.134766, "value_mse_loss_layer_031": 0.189453, "vq_loss_layer_000": 2e-05, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 5.9e-05, "vq_loss_layer_004": 0.000134, "vq_loss_layer_005": 0.000299, "vq_loss_layer_006": 0.000271, "vq_loss_layer_007": 0.000378, "vq_loss_layer_008": 0.00045, "vq_loss_layer_009": 0.000576, "vq_loss_layer_010": 0.000546, "vq_loss_layer_011": 0.000576, "vq_loss_layer_012": 0.000854, "vq_loss_layer_013": 0.000706, "vq_loss_layer_014": 0.000858, "vq_loss_layer_015": 0.000851, "vq_loss_layer_016": 0.001015, "vq_loss_layer_017": 0.00071, "vq_loss_layer_018": 0.000465, "vq_loss_layer_019": 0.000393, "vq_loss_layer_020": 0.000526, "vq_loss_layer_021": 0.000957, "vq_loss_layer_022": 0.000633, "vq_loss_layer_023": 0.000648, "vq_loss_layer_024": 0.000652, "vq_loss_layer_025": 0.000912, "vq_loss_layer_026": 0.001282, "vq_loss_layer_027": 0.001358, "vq_loss_layer_028": 0.001572, "vq_loss_layer_029": 0.002975, "vq_loss_layer_030": 0.004395, "vq_loss_layer_031": 0.012451 }, { "ce_loss": 2.286831, "epoch": 0.00067, "grad_norm": 0.0061735594645142555, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.060303, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.067938, "kv_vq_loss": 0.00111, "learning_rate": 0.0007065187006752065, "loss": 0.069006, "step": 670, "value_mse_loss_layer_000": 0.001305, "value_mse_loss_layer_001": 0.003525, "value_mse_loss_layer_002": 0.014343, "value_mse_loss_layer_003": 0.022095, "value_mse_loss_layer_004": 0.019409, "value_mse_loss_layer_005": 0.019531, "value_mse_loss_layer_006": 0.024536, "value_mse_loss_layer_007": 0.026367, "value_mse_loss_layer_008": 0.030029, "value_mse_loss_layer_009": 0.040039, "value_mse_loss_layer_010": 0.035645, "value_mse_loss_layer_011": 0.038818, "value_mse_loss_layer_012": 0.040283, "value_mse_loss_layer_013": 0.040771, "value_mse_loss_layer_014": 0.044189, "value_mse_loss_layer_015": 0.044434, "value_mse_loss_layer_016": 0.041016, "value_mse_loss_layer_017": 0.045166, "value_mse_loss_layer_018": 0.044922, "value_mse_loss_layer_019": 0.057129, "value_mse_loss_layer_020": 0.053711, "value_mse_loss_layer_021": 0.067871, "value_mse_loss_layer_022": 0.058594, "value_mse_loss_layer_023": 0.07373, "value_mse_loss_layer_024": 0.091309, "value_mse_loss_layer_025": 0.094727, "value_mse_loss_layer_026": 0.074707, "value_mse_loss_layer_027": 0.098145, "value_mse_loss_layer_028": 0.097168, "value_mse_loss_layer_029": 0.154297, "value_mse_loss_layer_030": 0.132812, "value_mse_loss_layer_031": 0.167969, "vq_loss_layer_000": 1.8e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 0.000105, "vq_loss_layer_005": 0.000138, "vq_loss_layer_006": 0.000309, "vq_loss_layer_007": 0.000402, "vq_loss_layer_008": 0.000341, "vq_loss_layer_009": 0.000515, "vq_loss_layer_010": 0.000423, "vq_loss_layer_011": 0.000477, "vq_loss_layer_012": 0.000828, "vq_loss_layer_013": 0.000725, "vq_loss_layer_014": 0.00079, "vq_loss_layer_015": 0.000866, "vq_loss_layer_016": 0.000858, "vq_loss_layer_017": 0.001213, "vq_loss_layer_018": 0.000439, "vq_loss_layer_019": 0.000374, "vq_loss_layer_020": 0.000481, "vq_loss_layer_021": 0.000942, "vq_loss_layer_022": 0.000446, "vq_loss_layer_023": 0.000717, "vq_loss_layer_024": 0.000603, "vq_loss_layer_025": 0.000668, "vq_loss_layer_026": 0.000912, "vq_loss_layer_027": 0.000961, "vq_loss_layer_028": 0.001205, "vq_loss_layer_029": 0.002426, "vq_loss_layer_030": 0.003815, "vq_loss_layer_031": 0.009155 }, { "ce_loss": 2.28842, "epoch": 0.00068, "grad_norm": 0.005936917383223772, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.06734, "kv_vq_loss": 0.001096, "learning_rate": 0.0007081272281765589, "loss": 0.068384, "step": 680, "value_mse_loss_layer_000": 0.001328, "value_mse_loss_layer_001": 0.003555, "value_mse_loss_layer_002": 0.013916, "value_mse_loss_layer_003": 0.022339, "value_mse_loss_layer_004": 0.02002, "value_mse_loss_layer_005": 0.020386, "value_mse_loss_layer_006": 0.023926, "value_mse_loss_layer_007": 0.026123, "value_mse_loss_layer_008": 0.031982, "value_mse_loss_layer_009": 0.039307, "value_mse_loss_layer_010": 0.034912, "value_mse_loss_layer_011": 0.03833, "value_mse_loss_layer_012": 0.037109, "value_mse_loss_layer_013": 0.039551, "value_mse_loss_layer_014": 0.042969, "value_mse_loss_layer_015": 0.04126, "value_mse_loss_layer_016": 0.040039, "value_mse_loss_layer_017": 0.041016, "value_mse_loss_layer_018": 0.044678, "value_mse_loss_layer_019": 0.052246, "value_mse_loss_layer_020": 0.053711, "value_mse_loss_layer_021": 0.063965, "value_mse_loss_layer_022": 0.055664, "value_mse_loss_layer_023": 0.070801, "value_mse_loss_layer_024": 0.07373, "value_mse_loss_layer_025": 0.097168, "value_mse_loss_layer_026": 0.075684, "value_mse_loss_layer_027": 0.101562, "value_mse_loss_layer_028": 0.117676, "value_mse_loss_layer_029": 0.149414, "value_mse_loss_layer_030": 0.130859, "value_mse_loss_layer_031": 0.175781, "vq_loss_layer_000": 1.9e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 5.1e-05, "vq_loss_layer_004": 0.00012, "vq_loss_layer_005": 0.000148, "vq_loss_layer_006": 0.000278, "vq_loss_layer_007": 0.000362, "vq_loss_layer_008": 0.000465, "vq_loss_layer_009": 0.000515, "vq_loss_layer_010": 0.000423, "vq_loss_layer_011": 0.000549, "vq_loss_layer_012": 0.000759, "vq_loss_layer_013": 0.00066, "vq_loss_layer_014": 0.000828, "vq_loss_layer_015": 0.000805, "vq_loss_layer_016": 0.000877, "vq_loss_layer_017": 0.000618, "vq_loss_layer_018": 0.000423, "vq_loss_layer_019": 0.000345, "vq_loss_layer_020": 0.000412, "vq_loss_layer_021": 0.000793, "vq_loss_layer_022": 0.00042, "vq_loss_layer_023": 0.000553, "vq_loss_layer_024": 0.000448, "vq_loss_layer_025": 0.000652, "vq_loss_layer_026": 0.000839, "vq_loss_layer_027": 0.001007, "vq_loss_layer_028": 0.00193, "vq_loss_layer_029": 0.002487, "vq_loss_layer_030": 0.00354, "vq_loss_layer_031": 0.009888 }, { "ce_loss": 2.254081, "epoch": 0.00069, "grad_norm": 0.008674518205225468, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.067767, "kv_vq_loss": 0.001147, "learning_rate": 0.0007097122726843138, "loss": 0.068854, "step": 690, "value_mse_loss_layer_000": 0.001289, "value_mse_loss_layer_001": 0.003479, "value_mse_loss_layer_002": 0.013611, "value_mse_loss_layer_003": 0.024658, "value_mse_loss_layer_004": 0.021484, "value_mse_loss_layer_005": 0.020264, "value_mse_loss_layer_006": 0.023438, "value_mse_loss_layer_007": 0.027588, "value_mse_loss_layer_008": 0.030151, "value_mse_loss_layer_009": 0.041016, "value_mse_loss_layer_010": 0.035645, "value_mse_loss_layer_011": 0.038818, "value_mse_loss_layer_012": 0.039795, "value_mse_loss_layer_013": 0.040527, "value_mse_loss_layer_014": 0.044189, "value_mse_loss_layer_015": 0.042725, "value_mse_loss_layer_016": 0.039307, "value_mse_loss_layer_017": 0.041504, "value_mse_loss_layer_018": 0.041992, "value_mse_loss_layer_019": 0.050537, "value_mse_loss_layer_020": 0.051025, "value_mse_loss_layer_021": 0.077637, "value_mse_loss_layer_022": 0.057373, "value_mse_loss_layer_023": 0.069336, "value_mse_loss_layer_024": 0.071777, "value_mse_loss_layer_025": 0.098633, "value_mse_loss_layer_026": 0.088379, "value_mse_loss_layer_027": 0.101074, "value_mse_loss_layer_028": 0.099609, "value_mse_loss_layer_029": 0.157227, "value_mse_loss_layer_030": 0.15918, "value_mse_loss_layer_031": 0.175781, "vq_loss_layer_000": 1.9e-05, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 6.8e-05, "vq_loss_layer_004": 0.000196, "vq_loss_layer_005": 0.000163, "vq_loss_layer_006": 0.000275, "vq_loss_layer_007": 0.000437, "vq_loss_layer_008": 0.000414, "vq_loss_layer_009": 0.000629, "vq_loss_layer_010": 0.000504, "vq_loss_layer_011": 0.000546, "vq_loss_layer_012": 0.000923, "vq_loss_layer_013": 0.000652, "vq_loss_layer_014": 0.000896, "vq_loss_layer_015": 0.000942, "vq_loss_layer_016": 0.000824, "vq_loss_layer_017": 0.000694, "vq_loss_layer_018": 0.000446, "vq_loss_layer_019": 0.000362, "vq_loss_layer_020": 0.000431, "vq_loss_layer_021": 0.001251, "vq_loss_layer_022": 0.000486, "vq_loss_layer_023": 0.000694, "vq_loss_layer_024": 0.000549, "vq_loss_layer_025": 0.00079, "vq_loss_layer_026": 0.001511, "vq_loss_layer_027": 0.001236, "vq_loss_layer_028": 0.001648, "vq_loss_layer_029": 0.002945, "vq_loss_layer_030": 0.005463, "vq_loss_layer_031": 0.010742 }, { "ce_loss": 2.284125, "epoch": 0.0007, "grad_norm": 0.006798422895371914, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.061523, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.067444, "kv_vq_loss": 0.001147, "learning_rate": 0.0007112745100035641, "loss": 0.068518, "step": 700, "value_mse_loss_layer_000": 0.001312, "value_mse_loss_layer_001": 0.00351, "value_mse_loss_layer_002": 0.013794, "value_mse_loss_layer_003": 0.021851, "value_mse_loss_layer_004": 0.019897, "value_mse_loss_layer_005": 0.021606, "value_mse_loss_layer_006": 0.022705, "value_mse_loss_layer_007": 0.025635, "value_mse_loss_layer_008": 0.029419, "value_mse_loss_layer_009": 0.039062, "value_mse_loss_layer_010": 0.036133, "value_mse_loss_layer_011": 0.037354, "value_mse_loss_layer_012": 0.037842, "value_mse_loss_layer_013": 0.042236, "value_mse_loss_layer_014": 0.04248, "value_mse_loss_layer_015": 0.04126, "value_mse_loss_layer_016": 0.038086, "value_mse_loss_layer_017": 0.042236, "value_mse_loss_layer_018": 0.043701, "value_mse_loss_layer_019": 0.050049, "value_mse_loss_layer_020": 0.053467, "value_mse_loss_layer_021": 0.063965, "value_mse_loss_layer_022": 0.05957, "value_mse_loss_layer_023": 0.075195, "value_mse_loss_layer_024": 0.074707, "value_mse_loss_layer_025": 0.105469, "value_mse_loss_layer_026": 0.078613, "value_mse_loss_layer_027": 0.105469, "value_mse_loss_layer_028": 0.099121, "value_mse_loss_layer_029": 0.155273, "value_mse_loss_layer_030": 0.137695, "value_mse_loss_layer_031": 0.176758, "vq_loss_layer_000": 1.8e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 0.000134, "vq_loss_layer_005": 0.000199, "vq_loss_layer_006": 0.000243, "vq_loss_layer_007": 0.000383, "vq_loss_layer_008": 0.000355, "vq_loss_layer_009": 0.00053, "vq_loss_layer_010": 0.000504, "vq_loss_layer_011": 0.000486, "vq_loss_layer_012": 0.000824, "vq_loss_layer_013": 0.000832, "vq_loss_layer_014": 0.000828, "vq_loss_layer_015": 0.000824, "vq_loss_layer_016": 0.000782, "vq_loss_layer_017": 0.000881, "vq_loss_layer_018": 0.000484, "vq_loss_layer_019": 0.000326, "vq_loss_layer_020": 0.000467, "vq_loss_layer_021": 0.000828, "vq_loss_layer_022": 0.0005, "vq_loss_layer_023": 0.000694, "vq_loss_layer_024": 0.000456, "vq_loss_layer_025": 0.000656, "vq_loss_layer_026": 0.000923, "vq_loss_layer_027": 0.001022, "vq_loss_layer_028": 0.001442, "vq_loss_layer_029": 0.002533, "vq_loss_layer_030": 0.003586, "vq_loss_layer_031": 0.009399 }, { "ce_loss": 2.287302, "epoch": 0.00071, "grad_norm": 0.004076071549206972, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.067255, "kv_vq_loss": 0.001126, "learning_rate": 0.0007128145871797688, "loss": 0.068347, "step": 710, "value_mse_loss_layer_000": 0.001289, "value_mse_loss_layer_001": 0.003479, "value_mse_loss_layer_002": 0.014038, "value_mse_loss_layer_003": 0.021729, "value_mse_loss_layer_004": 0.018799, "value_mse_loss_layer_005": 0.019653, "value_mse_loss_layer_006": 0.022583, "value_mse_loss_layer_007": 0.025635, "value_mse_loss_layer_008": 0.029419, "value_mse_loss_layer_009": 0.038574, "value_mse_loss_layer_010": 0.034424, "value_mse_loss_layer_011": 0.037354, "value_mse_loss_layer_012": 0.037598, "value_mse_loss_layer_013": 0.038574, "value_mse_loss_layer_014": 0.042969, "value_mse_loss_layer_015": 0.042236, "value_mse_loss_layer_016": 0.041504, "value_mse_loss_layer_017": 0.04126, "value_mse_loss_layer_018": 0.044189, "value_mse_loss_layer_019": 0.050537, "value_mse_loss_layer_020": 0.05249, "value_mse_loss_layer_021": 0.063965, "value_mse_loss_layer_022": 0.055176, "value_mse_loss_layer_023": 0.074707, "value_mse_loss_layer_024": 0.074707, "value_mse_loss_layer_025": 0.092285, "value_mse_loss_layer_026": 0.075195, "value_mse_loss_layer_027": 0.097656, "value_mse_loss_layer_028": 0.095703, "value_mse_loss_layer_029": 0.143555, "value_mse_loss_layer_030": 0.126953, "value_mse_loss_layer_031": 0.170898, "vq_loss_layer_000": 1.8e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 0.000107, "vq_loss_layer_005": 0.000154, "vq_loss_layer_006": 0.000235, "vq_loss_layer_007": 0.000368, "vq_loss_layer_008": 0.000345, "vq_loss_layer_009": 0.000475, "vq_loss_layer_010": 0.000412, "vq_loss_layer_011": 0.000483, "vq_loss_layer_012": 0.000774, "vq_loss_layer_013": 0.000622, "vq_loss_layer_014": 0.000774, "vq_loss_layer_015": 0.000751, "vq_loss_layer_016": 0.000904, "vq_loss_layer_017": 0.000607, "vq_loss_layer_018": 0.000416, "vq_loss_layer_019": 0.000322, "vq_loss_layer_020": 0.000372, "vq_loss_layer_021": 0.000847, "vq_loss_layer_022": 0.000404, "vq_loss_layer_023": 0.000698, "vq_loss_layer_024": 0.000523, "vq_loss_layer_025": 0.000595, "vq_loss_layer_026": 0.000832, "vq_loss_layer_027": 0.000938, "vq_loss_layer_028": 0.001236, "vq_loss_layer_029": 0.002441, "vq_loss_layer_030": 0.003036, "vq_loss_layer_031": 0.009583 }, { "ce_loss": 2.310487, "epoch": 0.00072, "grad_norm": 0.005368347745388746, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.067126, "kv_vq_loss": 0.001096, "learning_rate": 0.0007143331241078171, "loss": 0.068195, "step": 720, "value_mse_loss_layer_000": 0.001289, "value_mse_loss_layer_001": 0.003479, "value_mse_loss_layer_002": 0.014099, "value_mse_loss_layer_003": 0.025024, "value_mse_loss_layer_004": 0.021851, "value_mse_loss_layer_005": 0.020264, "value_mse_loss_layer_006": 0.022827, "value_mse_loss_layer_007": 0.026489, "value_mse_loss_layer_008": 0.030029, "value_mse_loss_layer_009": 0.03833, "value_mse_loss_layer_010": 0.033936, "value_mse_loss_layer_011": 0.037109, "value_mse_loss_layer_012": 0.04126, "value_mse_loss_layer_013": 0.038574, "value_mse_loss_layer_014": 0.041748, "value_mse_loss_layer_015": 0.041748, "value_mse_loss_layer_016": 0.037842, "value_mse_loss_layer_017": 0.040039, "value_mse_loss_layer_018": 0.04248, "value_mse_loss_layer_019": 0.047852, "value_mse_loss_layer_020": 0.048828, "value_mse_loss_layer_021": 0.060791, "value_mse_loss_layer_022": 0.057617, "value_mse_loss_layer_023": 0.068359, "value_mse_loss_layer_024": 0.072266, "value_mse_loss_layer_025": 0.090332, "value_mse_loss_layer_026": 0.072266, "value_mse_loss_layer_027": 0.097168, "value_mse_loss_layer_028": 0.109375, "value_mse_loss_layer_029": 0.140625, "value_mse_loss_layer_030": 0.126953, "value_mse_loss_layer_031": 0.171875, "vq_loss_layer_000": 1.8e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 5.3e-05, "vq_loss_layer_004": 0.000222, "vq_loss_layer_005": 0.000169, "vq_loss_layer_006": 0.000244, "vq_loss_layer_007": 0.000397, "vq_loss_layer_008": 0.000393, "vq_loss_layer_009": 0.000484, "vq_loss_layer_010": 0.000439, "vq_loss_layer_011": 0.000507, "vq_loss_layer_012": 0.001053, "vq_loss_layer_013": 0.000607, "vq_loss_layer_014": 0.000805, "vq_loss_layer_015": 0.000778, "vq_loss_layer_016": 0.000832, "vq_loss_layer_017": 0.000629, "vq_loss_layer_018": 0.000463, "vq_loss_layer_019": 0.000345, "vq_loss_layer_020": 0.000389, "vq_loss_layer_021": 0.000816, "vq_loss_layer_022": 0.000511, "vq_loss_layer_023": 0.000668, "vq_loss_layer_024": 0.000576, "vq_loss_layer_025": 0.000668, "vq_loss_layer_026": 0.000969, "vq_loss_layer_027": 0.001083, "vq_loss_layer_028": 0.001984, "vq_loss_layer_029": 0.002243, "vq_loss_layer_030": 0.003571, "vq_loss_layer_031": 0.010071 }, { "ce_loss": 2.29084, "epoch": 0.00073, "grad_norm": 0.00714690750464797, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.067041, "kv_vq_loss": 0.001101, "learning_rate": 0.0007158307150301139, "loss": 0.068103, "step": 730, "value_mse_loss_layer_000": 0.001266, "value_mse_loss_layer_001": 0.003494, "value_mse_loss_layer_002": 0.013794, "value_mse_loss_layer_003": 0.022339, "value_mse_loss_layer_004": 0.020142, "value_mse_loss_layer_005": 0.020386, "value_mse_loss_layer_006": 0.023682, "value_mse_loss_layer_007": 0.026733, "value_mse_loss_layer_008": 0.029785, "value_mse_loss_layer_009": 0.040771, "value_mse_loss_layer_010": 0.037598, "value_mse_loss_layer_011": 0.039307, "value_mse_loss_layer_012": 0.040039, "value_mse_loss_layer_013": 0.039795, "value_mse_loss_layer_014": 0.042969, "value_mse_loss_layer_015": 0.043213, "value_mse_loss_layer_016": 0.039307, "value_mse_loss_layer_017": 0.040527, "value_mse_loss_layer_018": 0.042236, "value_mse_loss_layer_019": 0.05249, "value_mse_loss_layer_020": 0.051758, "value_mse_loss_layer_021": 0.062988, "value_mse_loss_layer_022": 0.056152, "value_mse_loss_layer_023": 0.065918, "value_mse_loss_layer_024": 0.073242, "value_mse_loss_layer_025": 0.104492, "value_mse_loss_layer_026": 0.075195, "value_mse_loss_layer_027": 0.10498, "value_mse_loss_layer_028": 0.103516, "value_mse_loss_layer_029": 0.152344, "value_mse_loss_layer_030": 0.151367, "value_mse_loss_layer_031": 0.174805, "vq_loss_layer_000": 1.9e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 0.000129, "vq_loss_layer_005": 0.000176, "vq_loss_layer_006": 0.000296, "vq_loss_layer_007": 0.000416, "vq_loss_layer_008": 0.000389, "vq_loss_layer_009": 0.000618, "vq_loss_layer_010": 0.00053, "vq_loss_layer_011": 0.000603, "vq_loss_layer_012": 0.000904, "vq_loss_layer_013": 0.00066, "vq_loss_layer_014": 0.000824, "vq_loss_layer_015": 0.000957, "vq_loss_layer_016": 0.000881, "vq_loss_layer_017": 0.000694, "vq_loss_layer_018": 0.000507, "vq_loss_layer_019": 0.000435, "vq_loss_layer_020": 0.000422, "vq_loss_layer_021": 0.0009, "vq_loss_layer_022": 0.000511, "vq_loss_layer_023": 0.000626, "vq_loss_layer_024": 0.000622, "vq_loss_layer_025": 0.000961, "vq_loss_layer_026": 0.001106, "vq_loss_layer_027": 0.001419, "vq_loss_layer_028": 0.00177, "vq_loss_layer_029": 0.002869, "vq_loss_layer_030": 0.004913, "vq_loss_layer_031": 0.010254 }, { "ce_loss": 2.296687, "epoch": 0.00074, "grad_norm": 0.0061213006265461445, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.066763, "kv_vq_loss": 0.001085, "learning_rate": 0.0007173079299327439, "loss": 0.067816, "step": 740, "value_mse_loss_layer_000": 0.001282, "value_mse_loss_layer_001": 0.003433, "value_mse_loss_layer_002": 0.013855, "value_mse_loss_layer_003": 0.022949, "value_mse_loss_layer_004": 0.019409, "value_mse_loss_layer_005": 0.02002, "value_mse_loss_layer_006": 0.022949, "value_mse_loss_layer_007": 0.026367, "value_mse_loss_layer_008": 0.030151, "value_mse_loss_layer_009": 0.039062, "value_mse_loss_layer_010": 0.03418, "value_mse_loss_layer_011": 0.037842, "value_mse_loss_layer_012": 0.039062, "value_mse_loss_layer_013": 0.040527, "value_mse_loss_layer_014": 0.043457, "value_mse_loss_layer_015": 0.04126, "value_mse_loss_layer_016": 0.039307, "value_mse_loss_layer_017": 0.04126, "value_mse_loss_layer_018": 0.04126, "value_mse_loss_layer_019": 0.048096, "value_mse_loss_layer_020": 0.050537, "value_mse_loss_layer_021": 0.072754, "value_mse_loss_layer_022": 0.054932, "value_mse_loss_layer_023": 0.066406, "value_mse_loss_layer_024": 0.072754, "value_mse_loss_layer_025": 0.09668, "value_mse_loss_layer_026": 0.074707, "value_mse_loss_layer_027": 0.100586, "value_mse_loss_layer_028": 0.096191, "value_mse_loss_layer_029": 0.141602, "value_mse_loss_layer_030": 0.126953, "value_mse_loss_layer_031": 0.169922, "vq_loss_layer_000": 1.8e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 5.4e-05, "vq_loss_layer_004": 0.000118, "vq_loss_layer_005": 0.000162, "vq_loss_layer_006": 0.000273, "vq_loss_layer_007": 0.000425, "vq_loss_layer_008": 0.000422, "vq_loss_layer_009": 0.000534, "vq_loss_layer_010": 0.000456, "vq_loss_layer_011": 0.000542, "vq_loss_layer_012": 0.000877, "vq_loss_layer_013": 0.000729, "vq_loss_layer_014": 0.000801, "vq_loss_layer_015": 0.000736, "vq_loss_layer_016": 0.000881, "vq_loss_layer_017": 0.00082, "vq_loss_layer_018": 0.000429, "vq_loss_layer_019": 0.000355, "vq_loss_layer_020": 0.000441, "vq_loss_layer_021": 0.001083, "vq_loss_layer_022": 0.000473, "vq_loss_layer_023": 0.000645, "vq_loss_layer_024": 0.000607, "vq_loss_layer_025": 0.000717, "vq_loss_layer_026": 0.001007, "vq_loss_layer_027": 0.001099, "vq_loss_layer_028": 0.001366, "vq_loss_layer_029": 0.002319, "vq_loss_layer_030": 0.003479, "vq_loss_layer_031": 0.009827 }, { "ce_loss": 2.277749, "epoch": 0.00075, "grad_norm": 0.008274671621620655, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.012451, "key_mse_loss_layer_002": 0.063477, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.064941, "key_mse_loss_layer_006": 0.07666, "key_mse_loss_layer_007": 0.082031, "key_mse_loss_layer_008": 0.093262, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.112305, "key_mse_loss_layer_011": 0.106445, "key_mse_loss_layer_012": 0.081543, "key_mse_loss_layer_013": 0.137695, "key_mse_loss_layer_014": 0.132812, "key_mse_loss_layer_015": 0.125, "key_mse_loss_layer_016": 0.117188, "key_mse_loss_layer_017": 0.117188, "key_mse_loss_layer_018": 0.125977, "key_mse_loss_layer_019": 0.103516, "key_mse_loss_layer_020": 0.120605, "key_mse_loss_layer_021": 0.111328, "key_mse_loss_layer_022": 0.116211, "key_mse_loss_layer_023": 0.115234, "key_mse_loss_layer_024": 0.094727, "key_mse_loss_layer_025": 0.086426, "key_mse_loss_layer_026": 0.108887, "key_mse_loss_layer_027": 0.106934, "key_mse_loss_layer_028": 0.111816, "key_mse_loss_layer_029": 0.105957, "key_mse_loss_layer_030": 0.116211, "key_mse_loss_layer_031": 0.083984, "kv_mse_loss": 0.06698, "kv_vq_loss": 0.001078, "learning_rate": 0.000718765315847925, "loss": 0.068036, "step": 750, "value_mse_loss_layer_000": 0.001221, "value_mse_loss_layer_001": 0.003433, "value_mse_loss_layer_002": 0.014038, "value_mse_loss_layer_003": 0.022583, "value_mse_loss_layer_004": 0.024292, "value_mse_loss_layer_005": 0.02124, "value_mse_loss_layer_006": 0.024536, "value_mse_loss_layer_007": 0.027832, "value_mse_loss_layer_008": 0.029297, "value_mse_loss_layer_009": 0.03833, "value_mse_loss_layer_010": 0.035156, "value_mse_loss_layer_011": 0.036621, "value_mse_loss_layer_012": 0.051025, "value_mse_loss_layer_013": 0.039551, "value_mse_loss_layer_014": 0.041504, "value_mse_loss_layer_015": 0.040527, "value_mse_loss_layer_016": 0.036621, "value_mse_loss_layer_017": 0.041016, "value_mse_loss_layer_018": 0.041016, "value_mse_loss_layer_019": 0.048584, "value_mse_loss_layer_020": 0.052734, "value_mse_loss_layer_021": 0.065918, "value_mse_loss_layer_022": 0.051025, "value_mse_loss_layer_023": 0.089844, "value_mse_loss_layer_024": 0.080078, "value_mse_loss_layer_025": 0.102539, "value_mse_loss_layer_026": 0.094727, "value_mse_loss_layer_027": 0.117676, "value_mse_loss_layer_028": 0.094238, "value_mse_loss_layer_029": 0.180664, "value_mse_loss_layer_030": 0.166992, "value_mse_loss_layer_031": 0.208984, "vq_loss_layer_000": 1.9e-05, "vq_loss_layer_001": 4.3e-05, "vq_loss_layer_002": 6.5e-05, "vq_loss_layer_003": 9.5e-05, "vq_loss_layer_004": 0.000305, "vq_loss_layer_005": 0.000225, "vq_loss_layer_006": 0.000353, "vq_loss_layer_007": 0.000448, "vq_loss_layer_008": 0.000465, "vq_loss_layer_009": 0.000572, "vq_loss_layer_010": 0.000622, "vq_loss_layer_011": 0.000549, "vq_loss_layer_012": 0.002151, "vq_loss_layer_013": 0.000702, "vq_loss_layer_014": 0.000938, "vq_loss_layer_015": 0.00082, "vq_loss_layer_016": 0.000782, "vq_loss_layer_017": 0.000744, "vq_loss_layer_018": 0.000452, "vq_loss_layer_019": 0.000334, "vq_loss_layer_020": 0.000471, "vq_loss_layer_021": 0.001114, "vq_loss_layer_022": 0.000378, "vq_loss_layer_023": 0.001747, "vq_loss_layer_024": 0.001259, "vq_loss_layer_025": 0.001343, "vq_loss_layer_026": 0.002289, "vq_loss_layer_027": 0.003159, "vq_loss_layer_028": 0.00209, "vq_loss_layer_029": 0.006317, "vq_loss_layer_030": 0.010132, "vq_loss_layer_031": 0.022461 }, { "ce_loss": 2.29341, "epoch": 0.00076, "grad_norm": 0.005691359285265207, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.066495, "kv_vq_loss": 0.001075, "learning_rate": 0.0007202033980701978, "loss": 0.067532, "step": 760, "value_mse_loss_layer_000": 0.001274, "value_mse_loss_layer_001": 0.003464, "value_mse_loss_layer_002": 0.013794, "value_mse_loss_layer_003": 0.022339, "value_mse_loss_layer_004": 0.02002, "value_mse_loss_layer_005": 0.02002, "value_mse_loss_layer_006": 0.0271, "value_mse_loss_layer_007": 0.025879, "value_mse_loss_layer_008": 0.029541, "value_mse_loss_layer_009": 0.039551, "value_mse_loss_layer_010": 0.035645, "value_mse_loss_layer_011": 0.037842, "value_mse_loss_layer_012": 0.038574, "value_mse_loss_layer_013": 0.039062, "value_mse_loss_layer_014": 0.041992, "value_mse_loss_layer_015": 0.042236, "value_mse_loss_layer_016": 0.03833, "value_mse_loss_layer_017": 0.042236, "value_mse_loss_layer_018": 0.041504, "value_mse_loss_layer_019": 0.047363, "value_mse_loss_layer_020": 0.050537, "value_mse_loss_layer_021": 0.062012, "value_mse_loss_layer_022": 0.056152, "value_mse_loss_layer_023": 0.067383, "value_mse_loss_layer_024": 0.072754, "value_mse_loss_layer_025": 0.089844, "value_mse_loss_layer_026": 0.078613, "value_mse_loss_layer_027": 0.104004, "value_mse_loss_layer_028": 0.095215, "value_mse_loss_layer_029": 0.147461, "value_mse_loss_layer_030": 0.130859, "value_mse_loss_layer_031": 0.175781, "vq_loss_layer_000": 1.7e-05, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 5.6e-05, "vq_loss_layer_004": 0.000142, "vq_loss_layer_005": 0.000157, "vq_loss_layer_006": 0.0005, "vq_loss_layer_007": 0.00038, "vq_loss_layer_008": 0.000383, "vq_loss_layer_009": 0.000549, "vq_loss_layer_010": 0.000488, "vq_loss_layer_011": 0.000538, "vq_loss_layer_012": 0.000835, "vq_loss_layer_013": 0.000671, "vq_loss_layer_014": 0.000843, "vq_loss_layer_015": 0.000835, "vq_loss_layer_016": 0.000847, "vq_loss_layer_017": 0.000809, "vq_loss_layer_018": 0.000439, "vq_loss_layer_019": 0.000362, "vq_loss_layer_020": 0.000477, "vq_loss_layer_021": 0.000919, "vq_loss_layer_022": 0.000538, "vq_loss_layer_023": 0.000694, "vq_loss_layer_024": 0.000549, "vq_loss_layer_025": 0.000751, "vq_loss_layer_026": 0.001213, "vq_loss_layer_027": 0.001381, "vq_loss_layer_028": 0.001663, "vq_loss_layer_029": 0.003311, "vq_loss_layer_030": 0.003647, "vq_loss_layer_031": 0.011841 }, { "ce_loss": 2.345028, "epoch": 0.00077, "grad_norm": 0.00770959910005331, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.066284, "kv_vq_loss": 0.001079, "learning_rate": 0.0007216226812931203, "loss": 0.067334, "step": 770, "value_mse_loss_layer_000": 0.001282, "value_mse_loss_layer_001": 0.003387, "value_mse_loss_layer_002": 0.013794, "value_mse_loss_layer_003": 0.022461, "value_mse_loss_layer_004": 0.02002, "value_mse_loss_layer_005": 0.021118, "value_mse_loss_layer_006": 0.024292, "value_mse_loss_layer_007": 0.027344, "value_mse_loss_layer_008": 0.029053, "value_mse_loss_layer_009": 0.038818, "value_mse_loss_layer_010": 0.033447, "value_mse_loss_layer_011": 0.036377, "value_mse_loss_layer_012": 0.037354, "value_mse_loss_layer_013": 0.039062, "value_mse_loss_layer_014": 0.042236, "value_mse_loss_layer_015": 0.039307, "value_mse_loss_layer_016": 0.038574, "value_mse_loss_layer_017": 0.039307, "value_mse_loss_layer_018": 0.040039, "value_mse_loss_layer_019": 0.046143, "value_mse_loss_layer_020": 0.052979, "value_mse_loss_layer_021": 0.059326, "value_mse_loss_layer_022": 0.052246, "value_mse_loss_layer_023": 0.0625, "value_mse_loss_layer_024": 0.067383, "value_mse_loss_layer_025": 0.087402, "value_mse_loss_layer_026": 0.07373, "value_mse_loss_layer_027": 0.094727, "value_mse_loss_layer_028": 0.089355, "value_mse_loss_layer_029": 0.15918, "value_mse_loss_layer_030": 0.124023, "value_mse_loss_layer_031": 0.180664, "vq_loss_layer_000": 1.8e-05, "vq_loss_layer_001": 2.2e-05, "vq_loss_layer_002": 2.3e-05, "vq_loss_layer_003": 6.4e-05, "vq_loss_layer_004": 0.000117, "vq_loss_layer_005": 0.000177, "vq_loss_layer_006": 0.00033, "vq_loss_layer_007": 0.000401, "vq_loss_layer_008": 0.000402, "vq_loss_layer_009": 0.000565, "vq_loss_layer_010": 0.000463, "vq_loss_layer_011": 0.000507, "vq_loss_layer_012": 0.000816, "vq_loss_layer_013": 0.00066, "vq_loss_layer_014": 0.000984, "vq_loss_layer_015": 0.000694, "vq_loss_layer_016": 0.000854, "vq_loss_layer_017": 0.000629, "vq_loss_layer_018": 0.000423, "vq_loss_layer_019": 0.000355, "vq_loss_layer_020": 0.000467, "vq_loss_layer_021": 0.000919, "vq_loss_layer_022": 0.000458, "vq_loss_layer_023": 0.00058, "vq_loss_layer_024": 0.000534, "vq_loss_layer_025": 0.000797, "vq_loss_layer_026": 0.001266, "vq_loss_layer_027": 0.001236, "vq_loss_layer_028": 0.001488, "vq_loss_layer_029": 0.002808, "vq_loss_layer_030": 0.003708, "vq_loss_layer_031": 0.012939 }, { "ce_loss": 2.308656, "epoch": 0.00078, "grad_norm": 0.006626766640692949, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.060547, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.07959, "kv_mse_loss": 0.066248, "kv_vq_loss": 0.001092, "learning_rate": 0.00072302365067262, "loss": 0.06731, "step": 780, "value_mse_loss_layer_000": 0.001251, "value_mse_loss_layer_001": 0.003387, "value_mse_loss_layer_002": 0.013184, "value_mse_loss_layer_003": 0.021118, "value_mse_loss_layer_004": 0.018433, "value_mse_loss_layer_005": 0.018311, "value_mse_loss_layer_006": 0.022095, "value_mse_loss_layer_007": 0.024902, "value_mse_loss_layer_008": 0.029297, "value_mse_loss_layer_009": 0.038086, "value_mse_loss_layer_010": 0.036865, "value_mse_loss_layer_011": 0.037598, "value_mse_loss_layer_012": 0.037354, "value_mse_loss_layer_013": 0.039551, "value_mse_loss_layer_014": 0.041992, "value_mse_loss_layer_015": 0.043701, "value_mse_loss_layer_016": 0.038818, "value_mse_loss_layer_017": 0.041016, "value_mse_loss_layer_018": 0.046875, "value_mse_loss_layer_019": 0.058838, "value_mse_loss_layer_020": 0.05127, "value_mse_loss_layer_021": 0.063965, "value_mse_loss_layer_022": 0.059326, "value_mse_loss_layer_023": 0.070312, "value_mse_loss_layer_024": 0.075195, "value_mse_loss_layer_025": 0.100586, "value_mse_loss_layer_026": 0.074219, "value_mse_loss_layer_027": 0.09668, "value_mse_loss_layer_028": 0.095215, "value_mse_loss_layer_029": 0.144531, "value_mse_loss_layer_030": 0.124512, "value_mse_loss_layer_031": 0.167969, "vq_loss_layer_000": 1.7e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 0.000113, "vq_loss_layer_005": 0.000133, "vq_loss_layer_006": 0.000237, "vq_loss_layer_007": 0.00036, "vq_loss_layer_008": 0.000362, "vq_loss_layer_009": 0.000475, "vq_loss_layer_010": 0.00045, "vq_loss_layer_011": 0.000511, "vq_loss_layer_012": 0.000771, "vq_loss_layer_013": 0.000668, "vq_loss_layer_014": 0.000771, "vq_loss_layer_015": 0.001167, "vq_loss_layer_016": 0.000862, "vq_loss_layer_017": 0.000648, "vq_loss_layer_018": 0.000456, "vq_loss_layer_019": 0.000401, "vq_loss_layer_020": 0.000402, "vq_loss_layer_021": 0.000854, "vq_loss_layer_022": 0.000511, "vq_loss_layer_023": 0.000576, "vq_loss_layer_024": 0.000553, "vq_loss_layer_025": 0.000671, "vq_loss_layer_026": 0.000889, "vq_loss_layer_027": 0.00103, "vq_loss_layer_028": 0.001396, "vq_loss_layer_029": 0.00264, "vq_loss_layer_030": 0.003571, "vq_loss_layer_031": 0.010498 }, { "ce_loss": 2.291416, "epoch": 0.00079, "grad_norm": 0.007021329831331968, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.093262, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.066229, "kv_vq_loss": 0.001049, "learning_rate": 0.0007244067728226103, "loss": 0.067255, "step": 790, "value_mse_loss_layer_000": 0.001244, "value_mse_loss_layer_001": 0.003357, "value_mse_loss_layer_002": 0.014526, "value_mse_loss_layer_003": 0.021851, "value_mse_loss_layer_004": 0.02063, "value_mse_loss_layer_005": 0.021118, "value_mse_loss_layer_006": 0.021973, "value_mse_loss_layer_007": 0.025024, "value_mse_loss_layer_008": 0.029175, "value_mse_loss_layer_009": 0.037842, "value_mse_loss_layer_010": 0.033447, "value_mse_loss_layer_011": 0.034912, "value_mse_loss_layer_012": 0.038574, "value_mse_loss_layer_013": 0.035645, "value_mse_loss_layer_014": 0.03833, "value_mse_loss_layer_015": 0.037842, "value_mse_loss_layer_016": 0.034424, "value_mse_loss_layer_017": 0.037354, "value_mse_loss_layer_018": 0.043213, "value_mse_loss_layer_019": 0.049072, "value_mse_loss_layer_020": 0.047607, "value_mse_loss_layer_021": 0.063477, "value_mse_loss_layer_022": 0.053711, "value_mse_loss_layer_023": 0.06543, "value_mse_loss_layer_024": 0.075195, "value_mse_loss_layer_025": 0.092285, "value_mse_loss_layer_026": 0.076172, "value_mse_loss_layer_027": 0.104004, "value_mse_loss_layer_028": 0.094727, "value_mse_loss_layer_029": 0.149414, "value_mse_loss_layer_030": 0.151367, "value_mse_loss_layer_031": 0.171875, "vq_loss_layer_000": 1.8e-05, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 4.7e-05, "vq_loss_layer_004": 0.000135, "vq_loss_layer_005": 0.000183, "vq_loss_layer_006": 0.000227, "vq_loss_layer_007": 0.000351, "vq_loss_layer_008": 0.000454, "vq_loss_layer_009": 0.000591, "vq_loss_layer_010": 0.000507, "vq_loss_layer_011": 0.000496, "vq_loss_layer_012": 0.001015, "vq_loss_layer_013": 0.000526, "vq_loss_layer_014": 0.000725, "vq_loss_layer_015": 0.000637, "vq_loss_layer_016": 0.000721, "vq_loss_layer_017": 0.000572, "vq_loss_layer_018": 0.000465, "vq_loss_layer_019": 0.000345, "vq_loss_layer_020": 0.00033, "vq_loss_layer_021": 0.000923, "vq_loss_layer_022": 0.000408, "vq_loss_layer_023": 0.00053, "vq_loss_layer_024": 0.00053, "vq_loss_layer_025": 0.000706, "vq_loss_layer_026": 0.001083, "vq_loss_layer_027": 0.001137, "vq_loss_layer_028": 0.001427, "vq_loss_layer_029": 0.002396, "vq_loss_layer_030": 0.004486, "vq_loss_layer_031": 0.010742 }, { "ce_loss": 2.30286, "epoch": 0.0008, "grad_norm": 0.007815069518983364, "key_mse_loss_layer_000": 0.003571, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.067023, "kv_vq_loss": 0.001119, "learning_rate": 0.0007257724967479857, "loss": 0.068134, "step": 800, "value_mse_loss_layer_000": 0.001244, "value_mse_loss_layer_001": 0.003372, "value_mse_loss_layer_002": 0.013367, "value_mse_loss_layer_003": 0.021484, "value_mse_loss_layer_004": 0.018677, "value_mse_loss_layer_005": 0.019409, "value_mse_loss_layer_006": 0.024658, "value_mse_loss_layer_007": 0.025391, "value_mse_loss_layer_008": 0.029175, "value_mse_loss_layer_009": 0.037598, "value_mse_loss_layer_010": 0.032959, "value_mse_loss_layer_011": 0.035889, "value_mse_loss_layer_012": 0.039551, "value_mse_loss_layer_013": 0.037598, "value_mse_loss_layer_014": 0.042236, "value_mse_loss_layer_015": 0.041016, "value_mse_loss_layer_016": 0.041504, "value_mse_loss_layer_017": 0.039307, "value_mse_loss_layer_018": 0.041748, "value_mse_loss_layer_019": 0.049072, "value_mse_loss_layer_020": 0.049316, "value_mse_loss_layer_021": 0.080566, "value_mse_loss_layer_022": 0.057129, "value_mse_loss_layer_023": 0.080566, "value_mse_loss_layer_024": 0.07959, "value_mse_loss_layer_025": 0.098633, "value_mse_loss_layer_026": 0.073242, "value_mse_loss_layer_027": 0.103516, "value_mse_loss_layer_028": 0.094238, "value_mse_loss_layer_029": 0.150391, "value_mse_loss_layer_030": 0.132812, "value_mse_loss_layer_031": 0.178711, "vq_loss_layer_000": 1.7e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 0.000111, "vq_loss_layer_005": 0.000154, "vq_loss_layer_006": 0.000372, "vq_loss_layer_007": 0.000387, "vq_loss_layer_008": 0.000359, "vq_loss_layer_009": 0.000483, "vq_loss_layer_010": 0.000399, "vq_loss_layer_011": 0.000463, "vq_loss_layer_012": 0.000923, "vq_loss_layer_013": 0.000576, "vq_loss_layer_014": 0.000771, "vq_loss_layer_015": 0.00074, "vq_loss_layer_016": 0.000923, "vq_loss_layer_017": 0.000591, "vq_loss_layer_018": 0.000404, "vq_loss_layer_019": 0.000357, "vq_loss_layer_020": 0.000349, "vq_loss_layer_021": 0.001068, "vq_loss_layer_022": 0.000422, "vq_loss_layer_023": 0.000683, "vq_loss_layer_024": 0.000511, "vq_loss_layer_025": 0.000614, "vq_loss_layer_026": 0.000778, "vq_loss_layer_027": 0.00103, "vq_loss_layer_028": 0.00119, "vq_loss_layer_029": 0.002396, "vq_loss_layer_030": 0.003296, "vq_loss_layer_031": 0.010254 }, { "ce_loss": 2.321495, "epoch": 0.00081, "grad_norm": 0.005391918122768402, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.066638, "kv_vq_loss": 0.001118, "learning_rate": 0.0007271212547196623, "loss": 0.067725, "step": 810, "value_mse_loss_layer_000": 0.001236, "value_mse_loss_layer_001": 0.003326, "value_mse_loss_layer_002": 0.013977, "value_mse_loss_layer_003": 0.021118, "value_mse_loss_layer_004": 0.020508, "value_mse_loss_layer_005": 0.018799, "value_mse_loss_layer_006": 0.021973, "value_mse_loss_layer_007": 0.025024, "value_mse_loss_layer_008": 0.028564, "value_mse_loss_layer_009": 0.037354, "value_mse_loss_layer_010": 0.032715, "value_mse_loss_layer_011": 0.035889, "value_mse_loss_layer_012": 0.036865, "value_mse_loss_layer_013": 0.036865, "value_mse_loss_layer_014": 0.039307, "value_mse_loss_layer_015": 0.039062, "value_mse_loss_layer_016": 0.036133, "value_mse_loss_layer_017": 0.040771, "value_mse_loss_layer_018": 0.040771, "value_mse_loss_layer_019": 0.045654, "value_mse_loss_layer_020": 0.049072, "value_mse_loss_layer_021": 0.061035, "value_mse_loss_layer_022": 0.055664, "value_mse_loss_layer_023": 0.069336, "value_mse_loss_layer_024": 0.067383, "value_mse_loss_layer_025": 0.094727, "value_mse_loss_layer_026": 0.073242, "value_mse_loss_layer_027": 0.094238, "value_mse_loss_layer_028": 0.091309, "value_mse_loss_layer_029": 0.134766, "value_mse_loss_layer_030": 0.122559, "value_mse_loss_layer_031": 0.165039, "vq_loss_layer_000": 1.7e-05, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 4.3e-05, "vq_loss_layer_004": 0.000177, "vq_loss_layer_005": 0.00013, "vq_loss_layer_006": 0.000219, "vq_loss_layer_007": 0.000349, "vq_loss_layer_008": 0.000332, "vq_loss_layer_009": 0.000483, "vq_loss_layer_010": 0.000414, "vq_loss_layer_011": 0.000458, "vq_loss_layer_012": 0.000793, "vq_loss_layer_013": 0.000568, "vq_loss_layer_014": 0.00074, "vq_loss_layer_015": 0.000668, "vq_loss_layer_016": 0.00071, "vq_loss_layer_017": 0.00066, "vq_loss_layer_018": 0.000401, "vq_loss_layer_019": 0.00028, "vq_loss_layer_020": 0.000362, "vq_loss_layer_021": 0.000732, "vq_loss_layer_022": 0.000429, "vq_loss_layer_023": 0.000637, "vq_loss_layer_024": 0.000383, "vq_loss_layer_025": 0.000614, "vq_loss_layer_026": 0.000912, "vq_loss_layer_027": 0.000824, "vq_loss_layer_028": 0.001175, "vq_loss_layer_029": 0.002197, "vq_loss_layer_030": 0.003342, "vq_loss_layer_031": 0.009155 }, { "ce_loss": 2.292096, "epoch": 0.00082, "grad_norm": 0.007640261203050613, "key_mse_loss_layer_000": 0.003754, "key_mse_loss_layer_001": 0.010925, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.072266, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.079102, "kv_mse_loss": 0.066321, "kv_vq_loss": 0.00106, "learning_rate": 0.0007284534630959291, "loss": 0.067371, "step": 820, "value_mse_loss_layer_000": 0.001205, "value_mse_loss_layer_001": 0.003296, "value_mse_loss_layer_002": 0.013306, "value_mse_loss_layer_003": 0.02063, "value_mse_loss_layer_004": 0.019165, "value_mse_loss_layer_005": 0.020508, "value_mse_loss_layer_006": 0.021606, "value_mse_loss_layer_007": 0.025269, "value_mse_loss_layer_008": 0.029297, "value_mse_loss_layer_009": 0.038086, "value_mse_loss_layer_010": 0.032959, "value_mse_loss_layer_011": 0.034912, "value_mse_loss_layer_012": 0.035156, "value_mse_loss_layer_013": 0.035645, "value_mse_loss_layer_014": 0.039795, "value_mse_loss_layer_015": 0.038818, "value_mse_loss_layer_016": 0.036133, "value_mse_loss_layer_017": 0.037842, "value_mse_loss_layer_018": 0.039795, "value_mse_loss_layer_019": 0.05127, "value_mse_loss_layer_020": 0.049072, "value_mse_loss_layer_021": 0.064941, "value_mse_loss_layer_022": 0.056396, "value_mse_loss_layer_023": 0.064453, "value_mse_loss_layer_024": 0.074707, "value_mse_loss_layer_025": 0.094238, "value_mse_loss_layer_026": 0.074219, "value_mse_loss_layer_027": 0.12207, "value_mse_loss_layer_028": 0.103516, "value_mse_loss_layer_029": 0.149414, "value_mse_loss_layer_030": 0.144531, "value_mse_loss_layer_031": 0.170898, "vq_loss_layer_000": 1.7e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 0.000113, "vq_loss_layer_005": 0.000184, "vq_loss_layer_006": 0.000252, "vq_loss_layer_007": 0.000383, "vq_loss_layer_008": 0.000387, "vq_loss_layer_009": 0.000542, "vq_loss_layer_010": 0.000414, "vq_loss_layer_011": 0.000488, "vq_loss_layer_012": 0.000744, "vq_loss_layer_013": 0.000549, "vq_loss_layer_014": 0.000729, "vq_loss_layer_015": 0.000778, "vq_loss_layer_016": 0.000717, "vq_loss_layer_017": 0.000587, "vq_loss_layer_018": 0.00034, "vq_loss_layer_019": 0.000402, "vq_loss_layer_020": 0.000353, "vq_loss_layer_021": 0.000771, "vq_loss_layer_022": 0.000408, "vq_loss_layer_023": 0.000414, "vq_loss_layer_024": 0.000423, "vq_loss_layer_025": 0.000576, "vq_loss_layer_026": 0.000759, "vq_loss_layer_027": 0.00164, "vq_loss_layer_028": 0.001671, "vq_loss_layer_029": 0.002731, "vq_loss_layer_030": 0.004761, "vq_loss_layer_031": 0.009766 }, { "ce_loss": 2.255573, "epoch": 0.00083, "grad_norm": 0.005882002413272858, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.066528, "kv_vq_loss": 0.001058, "learning_rate": 0.0007297695230940183, "loss": 0.067578, "step": 830, "value_mse_loss_layer_000": 0.001259, "value_mse_loss_layer_001": 0.003326, "value_mse_loss_layer_002": 0.013611, "value_mse_loss_layer_003": 0.025879, "value_mse_loss_layer_004": 0.019165, "value_mse_loss_layer_005": 0.019409, "value_mse_loss_layer_006": 0.02356, "value_mse_loss_layer_007": 0.025635, "value_mse_loss_layer_008": 0.028564, "value_mse_loss_layer_009": 0.037598, "value_mse_loss_layer_010": 0.036621, "value_mse_loss_layer_011": 0.036621, "value_mse_loss_layer_012": 0.037598, "value_mse_loss_layer_013": 0.037842, "value_mse_loss_layer_014": 0.041748, "value_mse_loss_layer_015": 0.041748, "value_mse_loss_layer_016": 0.037598, "value_mse_loss_layer_017": 0.040039, "value_mse_loss_layer_018": 0.039062, "value_mse_loss_layer_019": 0.046875, "value_mse_loss_layer_020": 0.050049, "value_mse_loss_layer_021": 0.060791, "value_mse_loss_layer_022": 0.052734, "value_mse_loss_layer_023": 0.062988, "value_mse_loss_layer_024": 0.068359, "value_mse_loss_layer_025": 0.090332, "value_mse_loss_layer_026": 0.072266, "value_mse_loss_layer_027": 0.094727, "value_mse_loss_layer_028": 0.088379, "value_mse_loss_layer_029": 0.133789, "value_mse_loss_layer_030": 0.120605, "value_mse_loss_layer_031": 0.169922, "vq_loss_layer_000": 1.7e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 6e-05, "vq_loss_layer_004": 0.000117, "vq_loss_layer_005": 0.000158, "vq_loss_layer_006": 0.000309, "vq_loss_layer_007": 0.000395, "vq_loss_layer_008": 0.000366, "vq_loss_layer_009": 0.000465, "vq_loss_layer_010": 0.000633, "vq_loss_layer_011": 0.000504, "vq_loss_layer_012": 0.000771, "vq_loss_layer_013": 0.000618, "vq_loss_layer_014": 0.000793, "vq_loss_layer_015": 0.000858, "vq_loss_layer_016": 0.000809, "vq_loss_layer_017": 0.000767, "vq_loss_layer_018": 0.000397, "vq_loss_layer_019": 0.000359, "vq_loss_layer_020": 0.000507, "vq_loss_layer_021": 0.000916, "vq_loss_layer_022": 0.000473, "vq_loss_layer_023": 0.00061, "vq_loss_layer_024": 0.000622, "vq_loss_layer_025": 0.000721, "vq_loss_layer_026": 0.001144, "vq_loss_layer_027": 0.001137, "vq_loss_layer_028": 0.001434, "vq_loss_layer_029": 0.002472, "vq_loss_layer_030": 0.003891, "vq_loss_layer_031": 0.011414 }, { "ce_loss": 2.256329, "epoch": 0.00084, "grad_norm": 0.005455149337649345, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.107422, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.09375, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.066028, "kv_vq_loss": 0.001054, "learning_rate": 0.0007310698215154703, "loss": 0.067065, "step": 840, "value_mse_loss_layer_000": 0.001205, "value_mse_loss_layer_001": 0.003281, "value_mse_loss_layer_002": 0.013245, "value_mse_loss_layer_003": 0.020752, "value_mse_loss_layer_004": 0.019287, "value_mse_loss_layer_005": 0.018921, "value_mse_loss_layer_006": 0.021973, "value_mse_loss_layer_007": 0.024536, "value_mse_loss_layer_008": 0.028564, "value_mse_loss_layer_009": 0.035645, "value_mse_loss_layer_010": 0.032471, "value_mse_loss_layer_011": 0.0354, "value_mse_loss_layer_012": 0.040039, "value_mse_loss_layer_013": 0.036133, "value_mse_loss_layer_014": 0.039551, "value_mse_loss_layer_015": 0.039307, "value_mse_loss_layer_016": 0.038818, "value_mse_loss_layer_017": 0.039307, "value_mse_loss_layer_018": 0.045898, "value_mse_loss_layer_019": 0.046631, "value_mse_loss_layer_020": 0.047852, "value_mse_loss_layer_021": 0.063477, "value_mse_loss_layer_022": 0.062988, "value_mse_loss_layer_023": 0.066406, "value_mse_loss_layer_024": 0.072754, "value_mse_loss_layer_025": 0.094238, "value_mse_loss_layer_026": 0.074707, "value_mse_loss_layer_027": 0.099121, "value_mse_loss_layer_028": 0.092285, "value_mse_loss_layer_029": 0.140625, "value_mse_loss_layer_030": 0.125977, "value_mse_loss_layer_031": 0.170898, "vq_loss_layer_000": 1.8e-05, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 5.1e-05, "vq_loss_layer_004": 0.000123, "vq_loss_layer_005": 0.000129, "vq_loss_layer_006": 0.000242, "vq_loss_layer_007": 0.000336, "vq_loss_layer_008": 0.00036, "vq_loss_layer_009": 0.000414, "vq_loss_layer_010": 0.000423, "vq_loss_layer_011": 0.000496, "vq_loss_layer_012": 0.001015, "vq_loss_layer_013": 0.000595, "vq_loss_layer_014": 0.000725, "vq_loss_layer_015": 0.000679, "vq_loss_layer_016": 0.000862, "vq_loss_layer_017": 0.000664, "vq_loss_layer_018": 0.000504, "vq_loss_layer_019": 0.000341, "vq_loss_layer_020": 0.000328, "vq_loss_layer_021": 0.000835, "vq_loss_layer_022": 0.000618, "vq_loss_layer_023": 0.000538, "vq_loss_layer_024": 0.000553, "vq_loss_layer_025": 0.000744, "vq_loss_layer_026": 0.001015, "vq_loss_layer_027": 0.00106, "vq_loss_layer_028": 0.001457, "vq_loss_layer_029": 0.002808, "vq_loss_layer_030": 0.003891, "vq_loss_layer_031": 0.011475 }, { "ce_loss": 2.328474, "epoch": 0.00085, "grad_norm": 0.0076435464434325695, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.066028, "kv_vq_loss": 0.001069, "learning_rate": 0.0007323547314285731, "loss": 0.067084, "step": 850, "value_mse_loss_layer_000": 0.001236, "value_mse_loss_layer_001": 0.003296, "value_mse_loss_layer_002": 0.013489, "value_mse_loss_layer_003": 0.021973, "value_mse_loss_layer_004": 0.020142, "value_mse_loss_layer_005": 0.020264, "value_mse_loss_layer_006": 0.022827, "value_mse_loss_layer_007": 0.025757, "value_mse_loss_layer_008": 0.028687, "value_mse_loss_layer_009": 0.038574, "value_mse_loss_layer_010": 0.033203, "value_mse_loss_layer_011": 0.036133, "value_mse_loss_layer_012": 0.036865, "value_mse_loss_layer_013": 0.038574, "value_mse_loss_layer_014": 0.040039, "value_mse_loss_layer_015": 0.040771, "value_mse_loss_layer_016": 0.037109, "value_mse_loss_layer_017": 0.041016, "value_mse_loss_layer_018": 0.040771, "value_mse_loss_layer_019": 0.047607, "value_mse_loss_layer_020": 0.04834, "value_mse_loss_layer_021": 0.05835, "value_mse_loss_layer_022": 0.054932, "value_mse_loss_layer_023": 0.089355, "value_mse_loss_layer_024": 0.078125, "value_mse_loss_layer_025": 0.086914, "value_mse_loss_layer_026": 0.070312, "value_mse_loss_layer_027": 0.089844, "value_mse_loss_layer_028": 0.094727, "value_mse_loss_layer_029": 0.131836, "value_mse_loss_layer_030": 0.122559, "value_mse_loss_layer_031": 0.168945, "vq_loss_layer_000": 1.8e-05, "vq_loss_layer_001": 2.3e-05, "vq_loss_layer_002": 2.7e-05, "vq_loss_layer_003": 6.2e-05, "vq_loss_layer_004": 0.000152, "vq_loss_layer_005": 0.000172, "vq_loss_layer_006": 0.00028, "vq_loss_layer_007": 0.000383, "vq_loss_layer_008": 0.000366, "vq_loss_layer_009": 0.000523, "vq_loss_layer_010": 0.000427, "vq_loss_layer_011": 0.000519, "vq_loss_layer_012": 0.000782, "vq_loss_layer_013": 0.00066, "vq_loss_layer_014": 0.000763, "vq_loss_layer_015": 0.000805, "vq_loss_layer_016": 0.000782, "vq_loss_layer_017": 0.000832, "vq_loss_layer_018": 0.000538, "vq_loss_layer_019": 0.000427, "vq_loss_layer_020": 0.000427, "vq_loss_layer_021": 0.000862, "vq_loss_layer_022": 0.000526, "vq_loss_layer_023": 0.001305, "vq_loss_layer_024": 0.000725, "vq_loss_layer_025": 0.000782, "vq_loss_layer_026": 0.00103, "vq_loss_layer_027": 0.00103, "vq_loss_layer_028": 0.001778, "vq_loss_layer_029": 0.002777, "vq_loss_layer_030": 0.004425, "vq_loss_layer_031": 0.01239 }, { "ce_loss": 2.303784, "epoch": 0.00086, "grad_norm": 0.00844593159854412, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.065714, "kv_vq_loss": 0.001053, "learning_rate": 0.0007336246128108918, "loss": 0.066754, "step": 860, "value_mse_loss_layer_000": 0.001228, "value_mse_loss_layer_001": 0.003296, "value_mse_loss_layer_002": 0.013123, "value_mse_loss_layer_003": 0.021362, "value_mse_loss_layer_004": 0.018311, "value_mse_loss_layer_005": 0.018433, "value_mse_loss_layer_006": 0.021729, "value_mse_loss_layer_007": 0.025146, "value_mse_loss_layer_008": 0.028809, "value_mse_loss_layer_009": 0.03833, "value_mse_loss_layer_010": 0.034424, "value_mse_loss_layer_011": 0.036621, "value_mse_loss_layer_012": 0.037598, "value_mse_loss_layer_013": 0.03833, "value_mse_loss_layer_014": 0.042725, "value_mse_loss_layer_015": 0.042725, "value_mse_loss_layer_016": 0.03833, "value_mse_loss_layer_017": 0.040039, "value_mse_loss_layer_018": 0.040283, "value_mse_loss_layer_019": 0.04834, "value_mse_loss_layer_020": 0.04834, "value_mse_loss_layer_021": 0.06543, "value_mse_loss_layer_022": 0.054443, "value_mse_loss_layer_023": 0.068359, "value_mse_loss_layer_024": 0.073242, "value_mse_loss_layer_025": 0.092773, "value_mse_loss_layer_026": 0.069336, "value_mse_loss_layer_027": 0.119629, "value_mse_loss_layer_028": 0.09082, "value_mse_loss_layer_029": 0.162109, "value_mse_loss_layer_030": 0.125977, "value_mse_loss_layer_031": 0.163086, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 0.000118, "vq_loss_layer_005": 0.000139, "vq_loss_layer_006": 0.00023, "vq_loss_layer_007": 0.000357, "vq_loss_layer_008": 0.000355, "vq_loss_layer_009": 0.000479, "vq_loss_layer_010": 0.000425, "vq_loss_layer_011": 0.000486, "vq_loss_layer_012": 0.000782, "vq_loss_layer_013": 0.000626, "vq_loss_layer_014": 0.000782, "vq_loss_layer_015": 0.000763, "vq_loss_layer_016": 0.000809, "vq_loss_layer_017": 0.000637, "vq_loss_layer_018": 0.00041, "vq_loss_layer_019": 0.000353, "vq_loss_layer_020": 0.000364, "vq_loss_layer_021": 0.000984, "vq_loss_layer_022": 0.000422, "vq_loss_layer_023": 0.000599, "vq_loss_layer_024": 0.000542, "vq_loss_layer_025": 0.000671, "vq_loss_layer_026": 0.000851, "vq_loss_layer_027": 0.001678, "vq_loss_layer_028": 0.001244, "vq_loss_layer_029": 0.003098, "vq_loss_layer_030": 0.00383, "vq_loss_layer_031": 0.009644 }, { "ce_loss": 2.296472, "epoch": 0.00087, "grad_norm": 0.005352220498025417, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.065961, "kv_vq_loss": 0.00106, "learning_rate": 0.0007348798131546545, "loss": 0.067017, "step": 870, "value_mse_loss_layer_000": 0.001228, "value_mse_loss_layer_001": 0.003265, "value_mse_loss_layer_002": 0.013123, "value_mse_loss_layer_003": 0.022705, "value_mse_loss_layer_004": 0.019409, "value_mse_loss_layer_005": 0.02002, "value_mse_loss_layer_006": 0.021973, "value_mse_loss_layer_007": 0.025513, "value_mse_loss_layer_008": 0.029419, "value_mse_loss_layer_009": 0.03833, "value_mse_loss_layer_010": 0.034668, "value_mse_loss_layer_011": 0.037109, "value_mse_loss_layer_012": 0.046143, "value_mse_loss_layer_013": 0.038574, "value_mse_loss_layer_014": 0.042236, "value_mse_loss_layer_015": 0.041504, "value_mse_loss_layer_016": 0.038574, "value_mse_loss_layer_017": 0.039551, "value_mse_loss_layer_018": 0.038818, "value_mse_loss_layer_019": 0.046875, "value_mse_loss_layer_020": 0.05249, "value_mse_loss_layer_021": 0.064453, "value_mse_loss_layer_022": 0.056641, "value_mse_loss_layer_023": 0.063965, "value_mse_loss_layer_024": 0.069824, "value_mse_loss_layer_025": 0.084961, "value_mse_loss_layer_026": 0.071777, "value_mse_loss_layer_027": 0.104004, "value_mse_loss_layer_028": 0.090332, "value_mse_loss_layer_029": 0.149414, "value_mse_loss_layer_030": 0.123047, "value_mse_loss_layer_031": 0.170898, "vq_loss_layer_000": 1.8e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 5.7e-05, "vq_loss_layer_004": 0.000127, "vq_loss_layer_005": 0.000179, "vq_loss_layer_006": 0.000273, "vq_loss_layer_007": 0.000422, "vq_loss_layer_008": 0.000435, "vq_loss_layer_009": 0.000515, "vq_loss_layer_010": 0.000511, "vq_loss_layer_011": 0.000568, "vq_loss_layer_012": 0.001602, "vq_loss_layer_013": 0.000668, "vq_loss_layer_014": 0.000912, "vq_loss_layer_015": 0.000874, "vq_loss_layer_016": 0.000904, "vq_loss_layer_017": 0.000702, "vq_loss_layer_018": 0.000408, "vq_loss_layer_019": 0.000345, "vq_loss_layer_020": 0.000553, "vq_loss_layer_021": 0.001465, "vq_loss_layer_022": 0.000629, "vq_loss_layer_023": 0.000668, "vq_loss_layer_024": 0.000523, "vq_loss_layer_025": 0.000706, "vq_loss_layer_026": 0.001045, "vq_loss_layer_027": 0.001419, "vq_loss_layer_028": 0.001381, "vq_loss_layer_029": 0.00267, "vq_loss_layer_030": 0.003525, "vq_loss_layer_031": 0.01062 }, { "ce_loss": 2.337931, "epoch": 0.00088, "grad_norm": 0.0077569326385855675, "key_mse_loss_layer_000": 0.002823, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.05249, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.041504, "key_mse_loss_layer_005": 0.054688, "key_mse_loss_layer_006": 0.0625, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.065588, "kv_vq_loss": 0.001046, "learning_rate": 0.0007361206680375421, "loss": 0.066632, "step": 880, "value_mse_loss_layer_000": 0.001213, "value_mse_loss_layer_001": 0.003204, "value_mse_loss_layer_002": 0.012939, "value_mse_loss_layer_003": 0.020386, "value_mse_loss_layer_004": 0.02063, "value_mse_loss_layer_005": 0.019531, "value_mse_loss_layer_006": 0.021973, "value_mse_loss_layer_007": 0.026245, "value_mse_loss_layer_008": 0.030518, "value_mse_loss_layer_009": 0.036865, "value_mse_loss_layer_010": 0.033203, "value_mse_loss_layer_011": 0.0354, "value_mse_loss_layer_012": 0.036865, "value_mse_loss_layer_013": 0.036865, "value_mse_loss_layer_014": 0.038574, "value_mse_loss_layer_015": 0.038086, "value_mse_loss_layer_016": 0.037354, "value_mse_loss_layer_017": 0.039062, "value_mse_loss_layer_018": 0.036865, "value_mse_loss_layer_019": 0.044189, "value_mse_loss_layer_020": 0.051514, "value_mse_loss_layer_021": 0.060791, "value_mse_loss_layer_022": 0.056152, "value_mse_loss_layer_023": 0.062988, "value_mse_loss_layer_024": 0.067383, "value_mse_loss_layer_025": 0.092773, "value_mse_loss_layer_026": 0.120605, "value_mse_loss_layer_027": 0.098633, "value_mse_loss_layer_028": 0.098633, "value_mse_loss_layer_029": 0.138672, "value_mse_loss_layer_030": 0.121094, "value_mse_loss_layer_031": 0.164062, "vq_loss_layer_000": 1.8e-05, "vq_loss_layer_001": 2.3e-05, "vq_loss_layer_002": 2.3e-05, "vq_loss_layer_003": 5.8e-05, "vq_loss_layer_004": 0.00014, "vq_loss_layer_005": 0.000148, "vq_loss_layer_006": 0.000211, "vq_loss_layer_007": 0.000359, "vq_loss_layer_008": 0.000465, "vq_loss_layer_009": 0.000458, "vq_loss_layer_010": 0.0005, "vq_loss_layer_011": 0.000507, "vq_loss_layer_012": 0.000778, "vq_loss_layer_013": 0.000614, "vq_loss_layer_014": 0.000782, "vq_loss_layer_015": 0.00074, "vq_loss_layer_016": 0.000908, "vq_loss_layer_017": 0.000656, "vq_loss_layer_018": 0.000309, "vq_loss_layer_019": 0.000322, "vq_loss_layer_020": 0.000572, "vq_loss_layer_021": 0.000816, "vq_loss_layer_022": 0.000523, "vq_loss_layer_023": 0.000568, "vq_loss_layer_024": 0.000469, "vq_loss_layer_025": 0.000729, "vq_loss_layer_026": 0.00293, "vq_loss_layer_027": 0.001076, "vq_loss_layer_028": 0.001816, "vq_loss_layer_029": 0.002655, "vq_loss_layer_030": 0.003357, "vq_loss_layer_031": 0.010559 }, { "ce_loss": 2.258958, "epoch": 0.00089, "grad_norm": 0.005560380406677723, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.065887, "kv_vq_loss": 0.001072, "learning_rate": 0.0007373475016612281, "loss": 0.066913, "step": 890, "value_mse_loss_layer_000": 0.00119, "value_mse_loss_layer_001": 0.003235, "value_mse_loss_layer_002": 0.013, "value_mse_loss_layer_003": 0.022461, "value_mse_loss_layer_004": 0.019897, "value_mse_loss_layer_005": 0.019531, "value_mse_loss_layer_006": 0.022095, "value_mse_loss_layer_007": 0.025269, "value_mse_loss_layer_008": 0.028931, "value_mse_loss_layer_009": 0.037842, "value_mse_loss_layer_010": 0.032715, "value_mse_loss_layer_011": 0.035156, "value_mse_loss_layer_012": 0.036865, "value_mse_loss_layer_013": 0.038086, "value_mse_loss_layer_014": 0.041504, "value_mse_loss_layer_015": 0.040039, "value_mse_loss_layer_016": 0.036377, "value_mse_loss_layer_017": 0.039307, "value_mse_loss_layer_018": 0.039307, "value_mse_loss_layer_019": 0.048584, "value_mse_loss_layer_020": 0.047363, "value_mse_loss_layer_021": 0.058838, "value_mse_loss_layer_022": 0.053223, "value_mse_loss_layer_023": 0.070801, "value_mse_loss_layer_024": 0.07373, "value_mse_loss_layer_025": 0.091309, "value_mse_loss_layer_026": 0.090332, "value_mse_loss_layer_027": 0.104004, "value_mse_loss_layer_028": 0.102539, "value_mse_loss_layer_029": 0.158203, "value_mse_loss_layer_030": 0.136719, "value_mse_loss_layer_031": 0.176758, "vq_loss_layer_000": 1.7e-05, "vq_loss_layer_001": 3.6e-05, "vq_loss_layer_002": 3.9e-05, "vq_loss_layer_003": 8.2e-05, "vq_loss_layer_004": 0.000141, "vq_loss_layer_005": 0.000152, "vq_loss_layer_006": 0.000282, "vq_loss_layer_007": 0.000357, "vq_loss_layer_008": 0.000406, "vq_loss_layer_009": 0.000519, "vq_loss_layer_010": 0.000511, "vq_loss_layer_011": 0.000475, "vq_loss_layer_012": 0.000805, "vq_loss_layer_013": 0.000626, "vq_loss_layer_014": 0.000782, "vq_loss_layer_015": 0.000793, "vq_loss_layer_016": 0.000698, "vq_loss_layer_017": 0.000561, "vq_loss_layer_018": 0.000372, "vq_loss_layer_019": 0.000347, "vq_loss_layer_020": 0.00032, "vq_loss_layer_021": 0.000679, "vq_loss_layer_022": 0.000444, "vq_loss_layer_023": 0.000774, "vq_loss_layer_024": 0.000557, "vq_loss_layer_025": 0.000748, "vq_loss_layer_026": 0.001503, "vq_loss_layer_027": 0.001137, "vq_loss_layer_028": 0.002716, "vq_loss_layer_029": 0.004822, "vq_loss_layer_030": 0.003998, "vq_loss_layer_031": 0.014465 }, { "ce_loss": 2.272209, "epoch": 0.0009, "grad_norm": 0.005574071779847145, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.07373, "key_mse_loss_layer_031": 0.05835, "kv_mse_loss": 0.065155, "kv_vq_loss": 0.001024, "learning_rate": 0.000738560627359831, "loss": 0.066187, "step": 900, "value_mse_loss_layer_000": 0.00119, "value_mse_loss_layer_001": 0.003174, "value_mse_loss_layer_002": 0.012634, "value_mse_loss_layer_003": 0.021729, "value_mse_loss_layer_004": 0.019287, "value_mse_loss_layer_005": 0.019409, "value_mse_loss_layer_006": 0.021362, "value_mse_loss_layer_007": 0.026489, "value_mse_loss_layer_008": 0.028687, "value_mse_loss_layer_009": 0.040771, "value_mse_loss_layer_010": 0.034424, "value_mse_loss_layer_011": 0.035156, "value_mse_loss_layer_012": 0.036621, "value_mse_loss_layer_013": 0.036865, "value_mse_loss_layer_014": 0.040039, "value_mse_loss_layer_015": 0.040283, "value_mse_loss_layer_016": 0.038086, "value_mse_loss_layer_017": 0.039062, "value_mse_loss_layer_018": 0.040283, "value_mse_loss_layer_019": 0.046631, "value_mse_loss_layer_020": 0.052979, "value_mse_loss_layer_021": 0.063965, "value_mse_loss_layer_022": 0.051758, "value_mse_loss_layer_023": 0.067871, "value_mse_loss_layer_024": 0.068848, "value_mse_loss_layer_025": 0.085449, "value_mse_loss_layer_026": 0.07373, "value_mse_loss_layer_027": 0.098633, "value_mse_loss_layer_028": 0.089355, "value_mse_loss_layer_029": 0.134766, "value_mse_loss_layer_030": 0.12207, "value_mse_loss_layer_031": 0.169922, "vq_loss_layer_000": 1.7e-05, "vq_loss_layer_001": 2.6e-05, "vq_loss_layer_002": 2.6e-05, "vq_loss_layer_003": 7.2e-05, "vq_loss_layer_004": 0.000127, "vq_loss_layer_005": 0.000153, "vq_loss_layer_006": 0.000226, "vq_loss_layer_007": 0.000416, "vq_loss_layer_008": 0.000404, "vq_loss_layer_009": 0.00071, "vq_loss_layer_010": 0.000507, "vq_loss_layer_011": 0.000504, "vq_loss_layer_012": 0.000782, "vq_loss_layer_013": 0.000626, "vq_loss_layer_014": 0.000824, "vq_loss_layer_015": 0.000797, "vq_loss_layer_016": 0.000935, "vq_loss_layer_017": 0.000713, "vq_loss_layer_018": 0.000448, "vq_loss_layer_019": 0.000332, "vq_loss_layer_020": 0.000486, "vq_loss_layer_021": 0.00103, "vq_loss_layer_022": 0.000454, "vq_loss_layer_023": 0.000668, "vq_loss_layer_024": 0.000515, "vq_loss_layer_025": 0.000732, "vq_loss_layer_026": 0.001053, "vq_loss_layer_027": 0.001236, "vq_loss_layer_028": 0.001633, "vq_loss_layer_029": 0.003021, "vq_loss_layer_030": 0.003998, "vq_loss_layer_031": 0.013428 }, { "ce_loss": 2.353723, "epoch": 0.00091, "grad_norm": 0.005696074105799198, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.059814, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.092773, "key_mse_loss_layer_009": 0.099609, "key_mse_loss_layer_010": 0.11084, "key_mse_loss_layer_011": 0.105469, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.137695, "key_mse_loss_layer_014": 0.133789, "key_mse_loss_layer_015": 0.12207, "key_mse_loss_layer_016": 0.115234, "key_mse_loss_layer_017": 0.113281, "key_mse_loss_layer_018": 0.121582, "key_mse_loss_layer_019": 0.094727, "key_mse_loss_layer_020": 0.106934, "key_mse_loss_layer_021": 0.103027, "key_mse_loss_layer_022": 0.108398, "key_mse_loss_layer_023": 0.104492, "key_mse_loss_layer_024": 0.083008, "key_mse_loss_layer_025": 0.07666, "key_mse_loss_layer_026": 0.093262, "key_mse_loss_layer_027": 0.089844, "key_mse_loss_layer_028": 0.098633, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.096191, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.065912, "kv_vq_loss": 0.001011, "learning_rate": 0.0007397603480802733, "loss": 0.066956, "step": 910, "value_mse_loss_layer_000": 0.00116, "value_mse_loss_layer_001": 0.003296, "value_mse_loss_layer_002": 0.014038, "value_mse_loss_layer_003": 0.021362, "value_mse_loss_layer_004": 0.019897, "value_mse_loss_layer_005": 0.019653, "value_mse_loss_layer_006": 0.023193, "value_mse_loss_layer_007": 0.025757, "value_mse_loss_layer_008": 0.029175, "value_mse_loss_layer_009": 0.036133, "value_mse_loss_layer_010": 0.033936, "value_mse_loss_layer_011": 0.036133, "value_mse_loss_layer_012": 0.03833, "value_mse_loss_layer_013": 0.038086, "value_mse_loss_layer_014": 0.037842, "value_mse_loss_layer_015": 0.036133, "value_mse_loss_layer_016": 0.037598, "value_mse_loss_layer_017": 0.035645, "value_mse_loss_layer_018": 0.038574, "value_mse_loss_layer_019": 0.042969, "value_mse_loss_layer_020": 0.047119, "value_mse_loss_layer_021": 0.06543, "value_mse_loss_layer_022": 0.049805, "value_mse_loss_layer_023": 0.057129, "value_mse_loss_layer_024": 0.075684, "value_mse_loss_layer_025": 0.084961, "value_mse_loss_layer_026": 0.066406, "value_mse_loss_layer_027": 0.086426, "value_mse_loss_layer_028": 0.084473, "value_mse_loss_layer_029": 0.122559, "value_mse_loss_layer_030": 0.116211, "value_mse_loss_layer_031": 0.165039, "vq_loss_layer_000": 1.8e-05, "vq_loss_layer_001": 3.8e-05, "vq_loss_layer_002": 4.9e-05, "vq_loss_layer_003": 6.6e-05, "vq_loss_layer_004": 0.000156, "vq_loss_layer_005": 0.000179, "vq_loss_layer_006": 0.000349, "vq_loss_layer_007": 0.00042, "vq_loss_layer_008": 0.000519, "vq_loss_layer_009": 0.000557, "vq_loss_layer_010": 0.000641, "vq_loss_layer_011": 0.000698, "vq_loss_layer_012": 0.001045, "vq_loss_layer_013": 0.000793, "vq_loss_layer_014": 0.000885, "vq_loss_layer_015": 0.00071, "vq_loss_layer_016": 0.001045, "vq_loss_layer_017": 0.00079, "vq_loss_layer_018": 0.00041, "vq_loss_layer_019": 0.000399, "vq_loss_layer_020": 0.000471, "vq_loss_layer_021": 0.001328, "vq_loss_layer_022": 0.000626, "vq_loss_layer_023": 0.000687, "vq_loss_layer_024": 0.000923, "vq_loss_layer_025": 0.00119, "vq_loss_layer_026": 0.001297, "vq_loss_layer_027": 0.001381, "vq_loss_layer_028": 0.002808, "vq_loss_layer_029": 0.003128, "vq_loss_layer_030": 0.00412, "vq_loss_layer_031": 0.014282 }, { "ce_loss": 2.31368, "epoch": 0.00092, "grad_norm": 0.005021523684263229, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.065262, "kv_vq_loss": 0.00101, "learning_rate": 0.0007409469568363888, "loss": 0.066296, "step": 920, "value_mse_loss_layer_000": 0.001205, "value_mse_loss_layer_001": 0.003265, "value_mse_loss_layer_002": 0.013184, "value_mse_loss_layer_003": 0.019653, "value_mse_loss_layer_004": 0.018188, "value_mse_loss_layer_005": 0.017822, "value_mse_loss_layer_006": 0.020874, "value_mse_loss_layer_007": 0.023804, "value_mse_loss_layer_008": 0.027832, "value_mse_loss_layer_009": 0.035889, "value_mse_loss_layer_010": 0.032471, "value_mse_loss_layer_011": 0.034668, "value_mse_loss_layer_012": 0.034424, "value_mse_loss_layer_013": 0.035645, "value_mse_loss_layer_014": 0.039307, "value_mse_loss_layer_015": 0.038574, "value_mse_loss_layer_016": 0.035889, "value_mse_loss_layer_017": 0.038574, "value_mse_loss_layer_018": 0.040527, "value_mse_loss_layer_019": 0.046143, "value_mse_loss_layer_020": 0.046875, "value_mse_loss_layer_021": 0.05957, "value_mse_loss_layer_022": 0.052002, "value_mse_loss_layer_023": 0.0625, "value_mse_loss_layer_024": 0.074219, "value_mse_loss_layer_025": 0.088379, "value_mse_loss_layer_026": 0.069336, "value_mse_loss_layer_027": 0.09082, "value_mse_loss_layer_028": 0.089355, "value_mse_loss_layer_029": 0.140625, "value_mse_loss_layer_030": 0.122559, "value_mse_loss_layer_031": 0.157227, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 0.0001, "vq_loss_layer_005": 0.000127, "vq_loss_layer_006": 0.000211, "vq_loss_layer_007": 0.000326, "vq_loss_layer_008": 0.000319, "vq_loss_layer_009": 0.000422, "vq_loss_layer_010": 0.000389, "vq_loss_layer_011": 0.000446, "vq_loss_layer_012": 0.00069, "vq_loss_layer_013": 0.000561, "vq_loss_layer_014": 0.000736, "vq_loss_layer_015": 0.000668, "vq_loss_layer_016": 0.000782, "vq_loss_layer_017": 0.00069, "vq_loss_layer_018": 0.000374, "vq_loss_layer_019": 0.000322, "vq_loss_layer_020": 0.000359, "vq_loss_layer_021": 0.000797, "vq_loss_layer_022": 0.000383, "vq_loss_layer_023": 0.000507, "vq_loss_layer_024": 0.000603, "vq_loss_layer_025": 0.000572, "vq_loss_layer_026": 0.000813, "vq_loss_layer_027": 0.000946, "vq_loss_layer_028": 0.001175, "vq_loss_layer_029": 0.002579, "vq_loss_layer_030": 0.003128, "vq_loss_layer_031": 0.009094 }, { "ce_loss": 2.262648, "epoch": 0.00093, "grad_norm": 0.005108647048473358, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.065002, "kv_vq_loss": 0.001014, "learning_rate": 0.0007421207371384836, "loss": 0.066034, "step": 930, "value_mse_loss_layer_000": 0.001175, "value_mse_loss_layer_001": 0.003174, "value_mse_loss_layer_002": 0.012817, "value_mse_loss_layer_003": 0.02124, "value_mse_loss_layer_004": 0.018066, "value_mse_loss_layer_005": 0.019165, "value_mse_loss_layer_006": 0.020996, "value_mse_loss_layer_007": 0.024048, "value_mse_loss_layer_008": 0.027832, "value_mse_loss_layer_009": 0.039062, "value_mse_loss_layer_010": 0.03125, "value_mse_loss_layer_011": 0.033936, "value_mse_loss_layer_012": 0.040283, "value_mse_loss_layer_013": 0.035156, "value_mse_loss_layer_014": 0.039551, "value_mse_loss_layer_015": 0.039307, "value_mse_loss_layer_016": 0.035889, "value_mse_loss_layer_017": 0.038086, "value_mse_loss_layer_018": 0.040039, "value_mse_loss_layer_019": 0.050049, "value_mse_loss_layer_020": 0.047363, "value_mse_loss_layer_021": 0.059326, "value_mse_loss_layer_022": 0.059814, "value_mse_loss_layer_023": 0.063965, "value_mse_loss_layer_024": 0.068359, "value_mse_loss_layer_025": 0.085938, "value_mse_loss_layer_026": 0.070312, "value_mse_loss_layer_027": 0.093262, "value_mse_loss_layer_028": 0.089844, "value_mse_loss_layer_029": 0.141602, "value_mse_loss_layer_030": 0.121094, "value_mse_loss_layer_031": 0.15918, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 0.000113, "vq_loss_layer_005": 0.000173, "vq_loss_layer_006": 0.000228, "vq_loss_layer_007": 0.000355, "vq_loss_layer_008": 0.000341, "vq_loss_layer_009": 0.000652, "vq_loss_layer_010": 0.000359, "vq_loss_layer_011": 0.000414, "vq_loss_layer_012": 0.001167, "vq_loss_layer_013": 0.000595, "vq_loss_layer_014": 0.000717, "vq_loss_layer_015": 0.000736, "vq_loss_layer_016": 0.000755, "vq_loss_layer_017": 0.000568, "vq_loss_layer_018": 0.000381, "vq_loss_layer_019": 0.00033, "vq_loss_layer_020": 0.000353, "vq_loss_layer_021": 0.000751, "vq_loss_layer_022": 0.000549, "vq_loss_layer_023": 0.000576, "vq_loss_layer_024": 0.000446, "vq_loss_layer_025": 0.000549, "vq_loss_layer_026": 0.000828, "vq_loss_layer_027": 0.000977, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.002472, "vq_loss_layer_030": 0.003036, "vq_loss_layer_031": 0.009033 }, { "ce_loss": 2.277213, "epoch": 0.00094, "grad_norm": 0.008053760975599289, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.103027, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.065527, "kv_vq_loss": 0.001013, "learning_rate": 0.0007432819633999245, "loss": 0.066553, "step": 940, "value_mse_loss_layer_000": 0.001221, "value_mse_loss_layer_001": 0.003235, "value_mse_loss_layer_002": 0.013123, "value_mse_loss_layer_003": 0.020874, "value_mse_loss_layer_004": 0.018799, "value_mse_loss_layer_005": 0.02002, "value_mse_loss_layer_006": 0.022461, "value_mse_loss_layer_007": 0.025024, "value_mse_loss_layer_008": 0.029297, "value_mse_loss_layer_009": 0.039795, "value_mse_loss_layer_010": 0.034912, "value_mse_loss_layer_011": 0.039795, "value_mse_loss_layer_012": 0.037842, "value_mse_loss_layer_013": 0.039551, "value_mse_loss_layer_014": 0.040527, "value_mse_loss_layer_015": 0.040771, "value_mse_loss_layer_016": 0.036621, "value_mse_loss_layer_017": 0.041016, "value_mse_loss_layer_018": 0.037842, "value_mse_loss_layer_019": 0.04541, "value_mse_loss_layer_020": 0.048584, "value_mse_loss_layer_021": 0.062988, "value_mse_loss_layer_022": 0.052246, "value_mse_loss_layer_023": 0.060791, "value_mse_loss_layer_024": 0.066406, "value_mse_loss_layer_025": 0.094238, "value_mse_loss_layer_026": 0.068848, "value_mse_loss_layer_027": 0.085938, "value_mse_loss_layer_028": 0.093262, "value_mse_loss_layer_029": 0.151367, "value_mse_loss_layer_030": 0.115723, "value_mse_loss_layer_031": 0.166016, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 0.000129, "vq_loss_layer_005": 0.000203, "vq_loss_layer_006": 0.000269, "vq_loss_layer_007": 0.00036, "vq_loss_layer_008": 0.000439, "vq_loss_layer_009": 0.000591, "vq_loss_layer_010": 0.000462, "vq_loss_layer_011": 0.000721, "vq_loss_layer_012": 0.000854, "vq_loss_layer_013": 0.000809, "vq_loss_layer_014": 0.00082, "vq_loss_layer_015": 0.000751, "vq_loss_layer_016": 0.000786, "vq_loss_layer_017": 0.000725, "vq_loss_layer_018": 0.000408, "vq_loss_layer_019": 0.00032, "vq_loss_layer_020": 0.00046, "vq_loss_layer_021": 0.000973, "vq_loss_layer_022": 0.000519, "vq_loss_layer_023": 0.000565, "vq_loss_layer_024": 0.000519, "vq_loss_layer_025": 0.000816, "vq_loss_layer_026": 0.000992, "vq_loss_layer_027": 0.000957, "vq_loss_layer_028": 0.001457, "vq_loss_layer_029": 0.002777, "vq_loss_layer_030": 0.003189, "vq_loss_layer_031": 0.011169 }, { "ce_loss": 2.30359, "epoch": 0.00095, "grad_norm": 0.00603246595710516, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.045166, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.064999, "kv_vq_loss": 0.000998, "learning_rate": 0.0007444309013222118, "loss": 0.065997, "step": 950, "value_mse_loss_layer_000": 0.001198, "value_mse_loss_layer_001": 0.003174, "value_mse_loss_layer_002": 0.012756, "value_mse_loss_layer_003": 0.019653, "value_mse_loss_layer_004": 0.018799, "value_mse_loss_layer_005": 0.018677, "value_mse_loss_layer_006": 0.021484, "value_mse_loss_layer_007": 0.024414, "value_mse_loss_layer_008": 0.027222, "value_mse_loss_layer_009": 0.035889, "value_mse_loss_layer_010": 0.031494, "value_mse_loss_layer_011": 0.03418, "value_mse_loss_layer_012": 0.035645, "value_mse_loss_layer_013": 0.036377, "value_mse_loss_layer_014": 0.037842, "value_mse_loss_layer_015": 0.038086, "value_mse_loss_layer_016": 0.037354, "value_mse_loss_layer_017": 0.037842, "value_mse_loss_layer_018": 0.03833, "value_mse_loss_layer_019": 0.043701, "value_mse_loss_layer_020": 0.045654, "value_mse_loss_layer_021": 0.056885, "value_mse_loss_layer_022": 0.050537, "value_mse_loss_layer_023": 0.061035, "value_mse_loss_layer_024": 0.08252, "value_mse_loss_layer_025": 0.088867, "value_mse_loss_layer_026": 0.06543, "value_mse_loss_layer_027": 0.087402, "value_mse_loss_layer_028": 0.083984, "value_mse_loss_layer_029": 0.125, "value_mse_loss_layer_030": 0.114746, "value_mse_loss_layer_031": 0.163086, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 0.000132, "vq_loss_layer_005": 0.000159, "vq_loss_layer_006": 0.000259, "vq_loss_layer_007": 0.000353, "vq_loss_layer_008": 0.000357, "vq_loss_layer_009": 0.000465, "vq_loss_layer_010": 0.000404, "vq_loss_layer_011": 0.000452, "vq_loss_layer_012": 0.000774, "vq_loss_layer_013": 0.000599, "vq_loss_layer_014": 0.000706, "vq_loss_layer_015": 0.00074, "vq_loss_layer_016": 0.0009, "vq_loss_layer_017": 0.000671, "vq_loss_layer_018": 0.000469, "vq_loss_layer_019": 0.000366, "vq_loss_layer_020": 0.000401, "vq_loss_layer_021": 0.000893, "vq_loss_layer_022": 0.000511, "vq_loss_layer_023": 0.000633, "vq_loss_layer_024": 0.000896, "vq_loss_layer_025": 0.000813, "vq_loss_layer_026": 0.000992, "vq_loss_layer_027": 0.001083, "vq_loss_layer_028": 0.001335, "vq_loss_layer_029": 0.002304, "vq_loss_layer_030": 0.00325, "vq_loss_layer_031": 0.011169 }, { "ce_loss": 2.285325, "epoch": 0.00096, "grad_norm": 0.005272664595395327, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.064523, "kv_vq_loss": 0.000985, "learning_rate": 0.000745567808259892, "loss": 0.065527, "step": 960, "value_mse_loss_layer_000": 0.001175, "value_mse_loss_layer_001": 0.003174, "value_mse_loss_layer_002": 0.012512, "value_mse_loss_layer_003": 0.021606, "value_mse_loss_layer_004": 0.018433, "value_mse_loss_layer_005": 0.017822, "value_mse_loss_layer_006": 0.02063, "value_mse_loss_layer_007": 0.024658, "value_mse_loss_layer_008": 0.027588, "value_mse_loss_layer_009": 0.036621, "value_mse_loss_layer_010": 0.03125, "value_mse_loss_layer_011": 0.034668, "value_mse_loss_layer_012": 0.037109, "value_mse_loss_layer_013": 0.036133, "value_mse_loss_layer_014": 0.041748, "value_mse_loss_layer_015": 0.039551, "value_mse_loss_layer_016": 0.036133, "value_mse_loss_layer_017": 0.038574, "value_mse_loss_layer_018": 0.03833, "value_mse_loss_layer_019": 0.045654, "value_mse_loss_layer_020": 0.046875, "value_mse_loss_layer_021": 0.058838, "value_mse_loss_layer_022": 0.052979, "value_mse_loss_layer_023": 0.0625, "value_mse_loss_layer_024": 0.072266, "value_mse_loss_layer_025": 0.088867, "value_mse_loss_layer_026": 0.067871, "value_mse_loss_layer_027": 0.09375, "value_mse_loss_layer_028": 0.088379, "value_mse_loss_layer_029": 0.136719, "value_mse_loss_layer_030": 0.119141, "value_mse_loss_layer_031": 0.155273, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 0.000122, "vq_loss_layer_005": 0.000128, "vq_loss_layer_006": 0.000201, "vq_loss_layer_007": 0.000349, "vq_loss_layer_008": 0.000307, "vq_loss_layer_009": 0.000448, "vq_loss_layer_010": 0.000345, "vq_loss_layer_011": 0.000418, "vq_loss_layer_012": 0.000851, "vq_loss_layer_013": 0.000546, "vq_loss_layer_014": 0.000713, "vq_loss_layer_015": 0.000751, "vq_loss_layer_016": 0.00074, "vq_loss_layer_017": 0.000565, "vq_loss_layer_018": 0.000345, "vq_loss_layer_019": 0.000355, "vq_loss_layer_020": 0.000351, "vq_loss_layer_021": 0.000717, "vq_loss_layer_022": 0.000376, "vq_loss_layer_023": 0.000473, "vq_loss_layer_024": 0.000422, "vq_loss_layer_025": 0.000584, "vq_loss_layer_026": 0.000694, "vq_loss_layer_027": 0.000927, "vq_loss_layer_028": 0.001068, "vq_loss_layer_029": 0.002151, "vq_loss_layer_030": 0.002731, "vq_loss_layer_031": 0.008484 }, { "ce_loss": 2.256732, "epoch": 0.00097, "grad_norm": 0.006470081862062216, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.065085, "kv_vq_loss": 0.001052, "learning_rate": 0.000746692933566561, "loss": 0.066132, "step": 970, "value_mse_loss_layer_000": 0.001175, "value_mse_loss_layer_001": 0.003159, "value_mse_loss_layer_002": 0.012756, "value_mse_loss_layer_003": 0.020508, "value_mse_loss_layer_004": 0.018188, "value_mse_loss_layer_005": 0.019775, "value_mse_loss_layer_006": 0.021851, "value_mse_loss_layer_007": 0.024414, "value_mse_loss_layer_008": 0.028198, "value_mse_loss_layer_009": 0.037109, "value_mse_loss_layer_010": 0.031982, "value_mse_loss_layer_011": 0.034424, "value_mse_loss_layer_012": 0.035156, "value_mse_loss_layer_013": 0.0354, "value_mse_loss_layer_014": 0.038574, "value_mse_loss_layer_015": 0.038574, "value_mse_loss_layer_016": 0.036377, "value_mse_loss_layer_017": 0.039062, "value_mse_loss_layer_018": 0.039795, "value_mse_loss_layer_019": 0.044678, "value_mse_loss_layer_020": 0.05127, "value_mse_loss_layer_021": 0.060791, "value_mse_loss_layer_022": 0.052979, "value_mse_loss_layer_023": 0.067871, "value_mse_loss_layer_024": 0.068848, "value_mse_loss_layer_025": 0.092773, "value_mse_loss_layer_026": 0.074219, "value_mse_loss_layer_027": 0.093262, "value_mse_loss_layer_028": 0.094238, "value_mse_loss_layer_029": 0.137695, "value_mse_loss_layer_030": 0.124023, "value_mse_loss_layer_031": 0.166992, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 4.8e-05, "vq_loss_layer_004": 0.000119, "vq_loss_layer_005": 0.000168, "vq_loss_layer_006": 0.000248, "vq_loss_layer_007": 0.000351, "vq_loss_layer_008": 0.000343, "vq_loss_layer_009": 0.000496, "vq_loss_layer_010": 0.00041, "vq_loss_layer_011": 0.000448, "vq_loss_layer_012": 0.000751, "vq_loss_layer_013": 0.000557, "vq_loss_layer_014": 0.00071, "vq_loss_layer_015": 0.000687, "vq_loss_layer_016": 0.000755, "vq_loss_layer_017": 0.000641, "vq_loss_layer_018": 0.000408, "vq_loss_layer_019": 0.000298, "vq_loss_layer_020": 0.000395, "vq_loss_layer_021": 0.00079, "vq_loss_layer_022": 0.000406, "vq_loss_layer_023": 0.000576, "vq_loss_layer_024": 0.000423, "vq_loss_layer_025": 0.00066, "vq_loss_layer_026": 0.000942, "vq_loss_layer_027": 0.000927, "vq_loss_layer_028": 0.00135, "vq_loss_layer_029": 0.002365, "vq_loss_layer_030": 0.003357, "vq_loss_layer_031": 0.010132 }, { "ce_loss": 2.288932, "epoch": 0.00098, "grad_norm": 0.005517465528100729, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.094727, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.104004, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.126953, "key_mse_loss_layer_015": 0.114258, "key_mse_loss_layer_016": 0.106934, "key_mse_loss_layer_017": 0.108398, "key_mse_loss_layer_018": 0.113281, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.098633, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.060547, "kv_mse_loss": 0.064352, "kv_vq_loss": 0.001008, "learning_rate": 0.0007478065189231236, "loss": 0.065375, "step": 980, "value_mse_loss_layer_000": 0.001228, "value_mse_loss_layer_001": 0.003174, "value_mse_loss_layer_002": 0.012756, "value_mse_loss_layer_003": 0.020142, "value_mse_loss_layer_004": 0.020386, "value_mse_loss_layer_005": 0.018677, "value_mse_loss_layer_006": 0.021484, "value_mse_loss_layer_007": 0.024414, "value_mse_loss_layer_008": 0.027466, "value_mse_loss_layer_009": 0.037109, "value_mse_loss_layer_010": 0.032471, "value_mse_loss_layer_011": 0.034668, "value_mse_loss_layer_012": 0.036621, "value_mse_loss_layer_013": 0.037354, "value_mse_loss_layer_014": 0.03833, "value_mse_loss_layer_015": 0.040283, "value_mse_loss_layer_016": 0.035156, "value_mse_loss_layer_017": 0.037598, "value_mse_loss_layer_018": 0.036621, "value_mse_loss_layer_019": 0.041992, "value_mse_loss_layer_020": 0.045654, "value_mse_loss_layer_021": 0.054199, "value_mse_loss_layer_022": 0.051025, "value_mse_loss_layer_023": 0.056641, "value_mse_loss_layer_024": 0.060059, "value_mse_loss_layer_025": 0.076172, "value_mse_loss_layer_026": 0.058838, "value_mse_loss_layer_027": 0.083496, "value_mse_loss_layer_028": 0.081543, "value_mse_loss_layer_029": 0.120117, "value_mse_loss_layer_030": 0.116699, "value_mse_loss_layer_031": 0.164062, "vq_loss_layer_000": 1.7e-05, "vq_loss_layer_001": 3.2e-05, "vq_loss_layer_002": 4.4e-05, "vq_loss_layer_003": 7.8e-05, "vq_loss_layer_004": 0.000192, "vq_loss_layer_005": 0.000163, "vq_loss_layer_006": 0.00028, "vq_loss_layer_007": 0.000368, "vq_loss_layer_008": 0.000444, "vq_loss_layer_009": 0.000542, "vq_loss_layer_010": 0.000526, "vq_loss_layer_011": 0.000538, "vq_loss_layer_012": 0.000866, "vq_loss_layer_013": 0.00074, "vq_loss_layer_014": 0.000843, "vq_loss_layer_015": 0.000908, "vq_loss_layer_016": 0.000851, "vq_loss_layer_017": 0.000629, "vq_loss_layer_018": 0.00041, "vq_loss_layer_019": 0.000326, "vq_loss_layer_020": 0.000427, "vq_loss_layer_021": 0.000992, "vq_loss_layer_022": 0.000652, "vq_loss_layer_023": 0.00071, "vq_loss_layer_024": 0.000607, "vq_loss_layer_025": 0.000969, "vq_loss_layer_026": 0.001099, "vq_loss_layer_027": 0.001328, "vq_loss_layer_028": 0.001755, "vq_loss_layer_029": 0.003067, "vq_loss_layer_030": 0.006409, "vq_loss_layer_031": 0.014465 }, { "ce_loss": 2.283365, "epoch": 0.00099, "grad_norm": 0.005180981010198593, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.064606, "kv_vq_loss": 0.000995, "learning_rate": 0.0007489087986493875, "loss": 0.065631, "step": 990, "value_mse_loss_layer_000": 0.001167, "value_mse_loss_layer_001": 0.003159, "value_mse_loss_layer_002": 0.012451, "value_mse_loss_layer_003": 0.020142, "value_mse_loss_layer_004": 0.018677, "value_mse_loss_layer_005": 0.0177, "value_mse_loss_layer_006": 0.021118, "value_mse_loss_layer_007": 0.025024, "value_mse_loss_layer_008": 0.028564, "value_mse_loss_layer_009": 0.035889, "value_mse_loss_layer_010": 0.031128, "value_mse_loss_layer_011": 0.035889, "value_mse_loss_layer_012": 0.033936, "value_mse_loss_layer_013": 0.035889, "value_mse_loss_layer_014": 0.040039, "value_mse_loss_layer_015": 0.040283, "value_mse_loss_layer_016": 0.035645, "value_mse_loss_layer_017": 0.03833, "value_mse_loss_layer_018": 0.039551, "value_mse_loss_layer_019": 0.048828, "value_mse_loss_layer_020": 0.047119, "value_mse_loss_layer_021": 0.060059, "value_mse_loss_layer_022": 0.053223, "value_mse_loss_layer_023": 0.064453, "value_mse_loss_layer_024": 0.071289, "value_mse_loss_layer_025": 0.084473, "value_mse_loss_layer_026": 0.070312, "value_mse_loss_layer_027": 0.09082, "value_mse_loss_layer_028": 0.089355, "value_mse_loss_layer_029": 0.137695, "value_mse_loss_layer_030": 0.119629, "value_mse_loss_layer_031": 0.164062, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 0.00014, "vq_loss_layer_005": 0.000124, "vq_loss_layer_006": 0.000226, "vq_loss_layer_007": 0.000423, "vq_loss_layer_008": 0.000378, "vq_loss_layer_009": 0.000467, "vq_loss_layer_010": 0.00038, "vq_loss_layer_011": 0.000561, "vq_loss_layer_012": 0.000698, "vq_loss_layer_013": 0.00069, "vq_loss_layer_014": 0.000816, "vq_loss_layer_015": 0.000755, "vq_loss_layer_016": 0.000706, "vq_loss_layer_017": 0.000664, "vq_loss_layer_018": 0.000389, "vq_loss_layer_019": 0.000351, "vq_loss_layer_020": 0.000324, "vq_loss_layer_021": 0.000729, "vq_loss_layer_022": 0.000393, "vq_loss_layer_023": 0.000496, "vq_loss_layer_024": 0.000458, "vq_loss_layer_025": 0.000511, "vq_loss_layer_026": 0.000782, "vq_loss_layer_027": 0.000828, "vq_loss_layer_028": 0.001137, "vq_loss_layer_029": 0.002228, "vq_loss_layer_030": 0.002808, "vq_loss_layer_031": 0.009033 }, { "ce_loss": 2.291077, "epoch": 0.001, "grad_norm": 0.005207656417042017, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.059326, "key_mse_loss_layer_005": 0.064941, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.077637, "kv_mse_loss": 0.064746, "kv_vq_loss": 0.000976, "learning_rate": 0.0007499999999999999, "loss": 0.065741, "step": 1000, "value_mse_loss_layer_000": 0.001144, "value_mse_loss_layer_001": 0.003098, "value_mse_loss_layer_002": 0.012268, "value_mse_loss_layer_003": 0.019897, "value_mse_loss_layer_004": 0.0177, "value_mse_loss_layer_005": 0.0177, "value_mse_loss_layer_006": 0.020874, "value_mse_loss_layer_007": 0.02417, "value_mse_loss_layer_008": 0.028076, "value_mse_loss_layer_009": 0.036133, "value_mse_loss_layer_010": 0.036133, "value_mse_loss_layer_011": 0.034668, "value_mse_loss_layer_012": 0.035645, "value_mse_loss_layer_013": 0.035889, "value_mse_loss_layer_014": 0.038086, "value_mse_loss_layer_015": 0.039307, "value_mse_loss_layer_016": 0.04541, "value_mse_loss_layer_017": 0.03833, "value_mse_loss_layer_018": 0.04126, "value_mse_loss_layer_019": 0.045166, "value_mse_loss_layer_020": 0.053467, "value_mse_loss_layer_021": 0.057861, "value_mse_loss_layer_022": 0.050781, "value_mse_loss_layer_023": 0.062256, "value_mse_loss_layer_024": 0.070312, "value_mse_loss_layer_025": 0.08252, "value_mse_loss_layer_026": 0.064941, "value_mse_loss_layer_027": 0.092773, "value_mse_loss_layer_028": 0.084473, "value_mse_loss_layer_029": 0.129883, "value_mse_loss_layer_030": 0.115723, "value_mse_loss_layer_031": 0.154297, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 0.000124, "vq_loss_layer_005": 0.00016, "vq_loss_layer_006": 0.000227, "vq_loss_layer_007": 0.000381, "vq_loss_layer_008": 0.000345, "vq_loss_layer_009": 0.000441, "vq_loss_layer_010": 0.000446, "vq_loss_layer_011": 0.000473, "vq_loss_layer_012": 0.000759, "vq_loss_layer_013": 0.000542, "vq_loss_layer_014": 0.000656, "vq_loss_layer_015": 0.000801, "vq_loss_layer_016": 0.001022, "vq_loss_layer_017": 0.000633, "vq_loss_layer_018": 0.000454, "vq_loss_layer_019": 0.000294, "vq_loss_layer_020": 0.000433, "vq_loss_layer_021": 0.000679, "vq_loss_layer_022": 0.000355, "vq_loss_layer_023": 0.000519, "vq_loss_layer_024": 0.000437, "vq_loss_layer_025": 0.000515, "vq_loss_layer_026": 0.000748, "vq_loss_layer_027": 0.000973, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.002136, "vq_loss_layer_030": 0.003235, "vq_loss_layer_031": 0.009094 }, { "ce_loss": 2.313318, "epoch": 0.00101, "grad_norm": 0.006243514828383923, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.064609, "kv_vq_loss": 0.001031, "learning_rate": 0.0007510803434456605, "loss": 0.065625, "step": 1010, "value_mse_loss_layer_000": 0.001167, "value_mse_loss_layer_001": 0.003128, "value_mse_loss_layer_002": 0.013733, "value_mse_loss_layer_003": 0.019653, "value_mse_loss_layer_004": 0.017944, "value_mse_loss_layer_005": 0.017944, "value_mse_loss_layer_006": 0.020752, "value_mse_loss_layer_007": 0.023804, "value_mse_loss_layer_008": 0.027222, "value_mse_loss_layer_009": 0.0354, "value_mse_loss_layer_010": 0.031494, "value_mse_loss_layer_011": 0.03418, "value_mse_loss_layer_012": 0.034424, "value_mse_loss_layer_013": 0.038086, "value_mse_loss_layer_014": 0.037109, "value_mse_loss_layer_015": 0.038818, "value_mse_loss_layer_016": 0.035156, "value_mse_loss_layer_017": 0.037842, "value_mse_loss_layer_018": 0.037354, "value_mse_loss_layer_019": 0.043457, "value_mse_loss_layer_020": 0.04541, "value_mse_loss_layer_021": 0.061768, "value_mse_loss_layer_022": 0.050781, "value_mse_loss_layer_023": 0.062012, "value_mse_loss_layer_024": 0.063965, "value_mse_loss_layer_025": 0.092285, "value_mse_loss_layer_026": 0.069336, "value_mse_loss_layer_027": 0.088379, "value_mse_loss_layer_028": 0.092773, "value_mse_loss_layer_029": 0.128906, "value_mse_loss_layer_030": 0.116211, "value_mse_loss_layer_031": 0.158203, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 0.000107, "vq_loss_layer_005": 0.000138, "vq_loss_layer_006": 0.000215, "vq_loss_layer_007": 0.000326, "vq_loss_layer_008": 0.000332, "vq_loss_layer_009": 0.000437, "vq_loss_layer_010": 0.000393, "vq_loss_layer_011": 0.000465, "vq_loss_layer_012": 0.000729, "vq_loss_layer_013": 0.000721, "vq_loss_layer_014": 0.000687, "vq_loss_layer_015": 0.000736, "vq_loss_layer_016": 0.00071, "vq_loss_layer_017": 0.000595, "vq_loss_layer_018": 0.000374, "vq_loss_layer_019": 0.00028, "vq_loss_layer_020": 0.000341, "vq_loss_layer_021": 0.000896, "vq_loss_layer_022": 0.000399, "vq_loss_layer_023": 0.000546, "vq_loss_layer_024": 0.000448, "vq_loss_layer_025": 0.000675, "vq_loss_layer_026": 0.001068, "vq_loss_layer_027": 0.00095, "vq_loss_layer_028": 0.001503, "vq_loss_layer_029": 0.002304, "vq_loss_layer_030": 0.003448, "vq_loss_layer_031": 0.01001 }, { "ce_loss": 2.317229, "epoch": 0.00102, "grad_norm": 0.005101196467876434, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.064709, "kv_vq_loss": 0.001007, "learning_rate": 0.0007521500429404793, "loss": 0.065741, "step": 1020, "value_mse_loss_layer_000": 0.001175, "value_mse_loss_layer_001": 0.003174, "value_mse_loss_layer_002": 0.012512, "value_mse_loss_layer_003": 0.019531, "value_mse_loss_layer_004": 0.017944, "value_mse_loss_layer_005": 0.018799, "value_mse_loss_layer_006": 0.022217, "value_mse_loss_layer_007": 0.024658, "value_mse_loss_layer_008": 0.029175, "value_mse_loss_layer_009": 0.036377, "value_mse_loss_layer_010": 0.032227, "value_mse_loss_layer_011": 0.03418, "value_mse_loss_layer_012": 0.035889, "value_mse_loss_layer_013": 0.036377, "value_mse_loss_layer_014": 0.037842, "value_mse_loss_layer_015": 0.039062, "value_mse_loss_layer_016": 0.035156, "value_mse_loss_layer_017": 0.038574, "value_mse_loss_layer_018": 0.037598, "value_mse_loss_layer_019": 0.044189, "value_mse_loss_layer_020": 0.045898, "value_mse_loss_layer_021": 0.055908, "value_mse_loss_layer_022": 0.051025, "value_mse_loss_layer_023": 0.064941, "value_mse_loss_layer_024": 0.068848, "value_mse_loss_layer_025": 0.080078, "value_mse_loss_layer_026": 0.066895, "value_mse_loss_layer_027": 0.085938, "value_mse_loss_layer_028": 0.085938, "value_mse_loss_layer_029": 0.132812, "value_mse_loss_layer_030": 0.114258, "value_mse_loss_layer_031": 0.15332, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 0.000129, "vq_loss_layer_005": 0.00016, "vq_loss_layer_006": 0.000299, "vq_loss_layer_007": 0.000368, "vq_loss_layer_008": 0.00041, "vq_loss_layer_009": 0.000465, "vq_loss_layer_010": 0.000414, "vq_loss_layer_011": 0.000429, "vq_loss_layer_012": 0.000797, "vq_loss_layer_013": 0.000614, "vq_loss_layer_014": 0.000698, "vq_loss_layer_015": 0.000748, "vq_loss_layer_016": 0.000694, "vq_loss_layer_017": 0.000603, "vq_loss_layer_018": 0.000359, "vq_loss_layer_019": 0.000301, "vq_loss_layer_020": 0.00036, "vq_loss_layer_021": 0.000729, "vq_loss_layer_022": 0.000408, "vq_loss_layer_023": 0.000587, "vq_loss_layer_024": 0.00046, "vq_loss_layer_025": 0.000565, "vq_loss_layer_026": 0.000793, "vq_loss_layer_027": 0.000839, "vq_loss_layer_028": 0.00119, "vq_loss_layer_029": 0.00238, "vq_loss_layer_030": 0.003387, "vq_loss_layer_031": 0.009033 }, { "ce_loss": 2.262468, "epoch": 0.00103, "grad_norm": 0.005488668568432331, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.058838, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.064206, "kv_vq_loss": 0.00097, "learning_rate": 0.000753209306176293, "loss": 0.065204, "step": 1030, "value_mse_loss_layer_000": 0.001144, "value_mse_loss_layer_001": 0.003098, "value_mse_loss_layer_002": 0.012268, "value_mse_loss_layer_003": 0.02124, "value_mse_loss_layer_004": 0.017822, "value_mse_loss_layer_005": 0.017456, "value_mse_loss_layer_006": 0.021118, "value_mse_loss_layer_007": 0.023438, "value_mse_loss_layer_008": 0.027588, "value_mse_loss_layer_009": 0.0354, "value_mse_loss_layer_010": 0.031494, "value_mse_loss_layer_011": 0.033691, "value_mse_loss_layer_012": 0.03418, "value_mse_loss_layer_013": 0.034424, "value_mse_loss_layer_014": 0.037842, "value_mse_loss_layer_015": 0.038574, "value_mse_loss_layer_016": 0.039062, "value_mse_loss_layer_017": 0.037109, "value_mse_loss_layer_018": 0.042969, "value_mse_loss_layer_019": 0.043213, "value_mse_loss_layer_020": 0.049072, "value_mse_loss_layer_021": 0.056641, "value_mse_loss_layer_022": 0.052002, "value_mse_loss_layer_023": 0.060303, "value_mse_loss_layer_024": 0.068359, "value_mse_loss_layer_025": 0.082031, "value_mse_loss_layer_026": 0.066895, "value_mse_loss_layer_027": 0.097168, "value_mse_loss_layer_028": 0.085938, "value_mse_loss_layer_029": 0.133789, "value_mse_loss_layer_030": 0.119629, "value_mse_loss_layer_031": 0.161133, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 4.8e-05, "vq_loss_layer_004": 0.000134, "vq_loss_layer_005": 0.000135, "vq_loss_layer_006": 0.00025, "vq_loss_layer_007": 0.000351, "vq_loss_layer_008": 0.00034, "vq_loss_layer_009": 0.000462, "vq_loss_layer_010": 0.000385, "vq_loss_layer_011": 0.000467, "vq_loss_layer_012": 0.000721, "vq_loss_layer_013": 0.000546, "vq_loss_layer_014": 0.000721, "vq_loss_layer_015": 0.000801, "vq_loss_layer_016": 0.000835, "vq_loss_layer_017": 0.000664, "vq_loss_layer_018": 0.000422, "vq_loss_layer_019": 0.000311, "vq_loss_layer_020": 0.000385, "vq_loss_layer_021": 0.000751, "vq_loss_layer_022": 0.000401, "vq_loss_layer_023": 0.000458, "vq_loss_layer_024": 0.000441, "vq_loss_layer_025": 0.000549, "vq_loss_layer_026": 0.000828, "vq_loss_layer_027": 0.001091, "vq_loss_layer_028": 0.001175, "vq_loss_layer_029": 0.002304, "vq_loss_layer_030": 0.003387, "vq_loss_layer_031": 0.009399 }, { "ce_loss": 2.307307, "epoch": 0.00104, "grad_norm": 0.007477556820958853, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.064667, "kv_vq_loss": 0.000991, "learning_rate": 0.0007542583348246949, "loss": 0.065662, "step": 1040, "value_mse_loss_layer_000": 0.001106, "value_mse_loss_layer_001": 0.003098, "value_mse_loss_layer_002": 0.012085, "value_mse_loss_layer_003": 0.020874, "value_mse_loss_layer_004": 0.018799, "value_mse_loss_layer_005": 0.017578, "value_mse_loss_layer_006": 0.020874, "value_mse_loss_layer_007": 0.024902, "value_mse_loss_layer_008": 0.026978, "value_mse_loss_layer_009": 0.035156, "value_mse_loss_layer_010": 0.030396, "value_mse_loss_layer_011": 0.033691, "value_mse_loss_layer_012": 0.034424, "value_mse_loss_layer_013": 0.034912, "value_mse_loss_layer_014": 0.038086, "value_mse_loss_layer_015": 0.036621, "value_mse_loss_layer_016": 0.03418, "value_mse_loss_layer_017": 0.036133, "value_mse_loss_layer_018": 0.038574, "value_mse_loss_layer_019": 0.057617, "value_mse_loss_layer_020": 0.047119, "value_mse_loss_layer_021": 0.057861, "value_mse_loss_layer_022": 0.052979, "value_mse_loss_layer_023": 0.062256, "value_mse_loss_layer_024": 0.064941, "value_mse_loss_layer_025": 0.078613, "value_mse_loss_layer_026": 0.067871, "value_mse_loss_layer_027": 0.095703, "value_mse_loss_layer_028": 0.087402, "value_mse_loss_layer_029": 0.161133, "value_mse_loss_layer_030": 0.12207, "value_mse_loss_layer_031": 0.157227, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 0.000172, "vq_loss_layer_005": 0.000134, "vq_loss_layer_006": 0.000239, "vq_loss_layer_007": 0.000389, "vq_loss_layer_008": 0.000366, "vq_loss_layer_009": 0.000437, "vq_loss_layer_010": 0.000385, "vq_loss_layer_011": 0.000441, "vq_loss_layer_012": 0.000702, "vq_loss_layer_013": 0.000538, "vq_loss_layer_014": 0.000717, "vq_loss_layer_015": 0.000637, "vq_loss_layer_016": 0.000706, "vq_loss_layer_017": 0.000622, "vq_loss_layer_018": 0.000401, "vq_loss_layer_019": 0.000444, "vq_loss_layer_020": 0.00037, "vq_loss_layer_021": 0.000835, "vq_loss_layer_022": 0.000444, "vq_loss_layer_023": 0.000534, "vq_loss_layer_024": 0.000484, "vq_loss_layer_025": 0.000565, "vq_loss_layer_026": 0.00095, "vq_loss_layer_027": 0.001099, "vq_loss_layer_028": 0.001236, "vq_loss_layer_029": 0.003204, "vq_loss_layer_030": 0.004608, "vq_loss_layer_031": 0.009216 }, { "ce_loss": 2.290381, "epoch": 0.00105, "grad_norm": 0.006046660710126162, "key_mse_loss_layer_000": 0.002777, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.064685, "kv_vq_loss": 0.001028, "learning_rate": 0.0007552973247674843, "loss": 0.065747, "step": 1050, "value_mse_loss_layer_000": 0.001144, "value_mse_loss_layer_001": 0.003052, "value_mse_loss_layer_002": 0.012329, "value_mse_loss_layer_003": 0.019043, "value_mse_loss_layer_004": 0.016968, "value_mse_loss_layer_005": 0.018311, "value_mse_loss_layer_006": 0.020996, "value_mse_loss_layer_007": 0.02356, "value_mse_loss_layer_008": 0.027344, "value_mse_loss_layer_009": 0.035645, "value_mse_loss_layer_010": 0.031982, "value_mse_loss_layer_011": 0.03418, "value_mse_loss_layer_012": 0.039062, "value_mse_loss_layer_013": 0.035889, "value_mse_loss_layer_014": 0.038086, "value_mse_loss_layer_015": 0.039551, "value_mse_loss_layer_016": 0.034424, "value_mse_loss_layer_017": 0.038574, "value_mse_loss_layer_018": 0.036621, "value_mse_loss_layer_019": 0.046143, "value_mse_loss_layer_020": 0.048096, "value_mse_loss_layer_021": 0.063965, "value_mse_loss_layer_022": 0.050781, "value_mse_loss_layer_023": 0.060791, "value_mse_loss_layer_024": 0.063477, "value_mse_loss_layer_025": 0.088379, "value_mse_loss_layer_026": 0.070312, "value_mse_loss_layer_027": 0.091309, "value_mse_loss_layer_028": 0.087402, "value_mse_loss_layer_029": 0.138672, "value_mse_loss_layer_030": 0.110352, "value_mse_loss_layer_031": 0.157227, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 0.000101, "vq_loss_layer_005": 0.000167, "vq_loss_layer_006": 0.000237, "vq_loss_layer_007": 0.000343, "vq_loss_layer_008": 0.000355, "vq_loss_layer_009": 0.000408, "vq_loss_layer_010": 0.000381, "vq_loss_layer_011": 0.000441, "vq_loss_layer_012": 0.000999, "vq_loss_layer_013": 0.000553, "vq_loss_layer_014": 0.000687, "vq_loss_layer_015": 0.000927, "vq_loss_layer_016": 0.000648, "vq_loss_layer_017": 0.000652, "vq_loss_layer_018": 0.00033, "vq_loss_layer_019": 0.000372, "vq_loss_layer_020": 0.00041, "vq_loss_layer_021": 0.000813, "vq_loss_layer_022": 0.00036, "vq_loss_layer_023": 0.000471, "vq_loss_layer_024": 0.000399, "vq_loss_layer_025": 0.000557, "vq_loss_layer_026": 0.000973, "vq_loss_layer_027": 0.000908, "vq_loss_layer_028": 0.001152, "vq_loss_layer_029": 0.002045, "vq_loss_layer_030": 0.003052, "vq_loss_layer_031": 0.009094 }, { "ce_loss": 2.267513, "epoch": 0.00106, "grad_norm": 0.00618100306019187, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.094727, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.106445, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.102539, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.111328, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.102539, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.064191, "kv_vq_loss": 0.00099, "learning_rate": 0.0007563264663161925, "loss": 0.065176, "step": 1060, "value_mse_loss_layer_000": 0.001076, "value_mse_loss_layer_001": 0.003036, "value_mse_loss_layer_002": 0.012329, "value_mse_loss_layer_003": 0.019165, "value_mse_loss_layer_004": 0.018433, "value_mse_loss_layer_005": 0.018066, "value_mse_loss_layer_006": 0.021973, "value_mse_loss_layer_007": 0.024658, "value_mse_loss_layer_008": 0.02771, "value_mse_loss_layer_009": 0.036865, "value_mse_loss_layer_010": 0.032959, "value_mse_loss_layer_011": 0.036865, "value_mse_loss_layer_012": 0.037842, "value_mse_loss_layer_013": 0.037354, "value_mse_loss_layer_014": 0.04248, "value_mse_loss_layer_015": 0.039062, "value_mse_loss_layer_016": 0.036865, "value_mse_loss_layer_017": 0.038818, "value_mse_loss_layer_018": 0.041016, "value_mse_loss_layer_019": 0.046143, "value_mse_loss_layer_020": 0.052002, "value_mse_loss_layer_021": 0.059814, "value_mse_loss_layer_022": 0.049805, "value_mse_loss_layer_023": 0.061768, "value_mse_loss_layer_024": 0.067383, "value_mse_loss_layer_025": 0.085938, "value_mse_loss_layer_026": 0.069824, "value_mse_loss_layer_027": 0.100098, "value_mse_loss_layer_028": 0.089355, "value_mse_loss_layer_029": 0.136719, "value_mse_loss_layer_030": 0.124023, "value_mse_loss_layer_031": 0.168945, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.8e-05, "vq_loss_layer_004": 0.000125, "vq_loss_layer_005": 0.000158, "vq_loss_layer_006": 0.000284, "vq_loss_layer_007": 0.000347, "vq_loss_layer_008": 0.000389, "vq_loss_layer_009": 0.000488, "vq_loss_layer_010": 0.000479, "vq_loss_layer_011": 0.000553, "vq_loss_layer_012": 0.000832, "vq_loss_layer_013": 0.000675, "vq_loss_layer_014": 0.001022, "vq_loss_layer_015": 0.00082, "vq_loss_layer_016": 0.000813, "vq_loss_layer_017": 0.000698, "vq_loss_layer_018": 0.000549, "vq_loss_layer_019": 0.000351, "vq_loss_layer_020": 0.000431, "vq_loss_layer_021": 0.000809, "vq_loss_layer_022": 0.000418, "vq_loss_layer_023": 0.00058, "vq_loss_layer_024": 0.000542, "vq_loss_layer_025": 0.000736, "vq_loss_layer_026": 0.000942, "vq_loss_layer_027": 0.001343, "vq_loss_layer_028": 0.002182, "vq_loss_layer_029": 0.002686, "vq_loss_layer_030": 0.003632, "vq_loss_layer_031": 0.010498 }, { "ce_loss": 2.277746, "epoch": 0.00107, "grad_norm": 0.0073477281257510185, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.064307, "kv_vq_loss": 0.00098, "learning_rate": 0.0007573459444213023, "loss": 0.065308, "step": 1070, "value_mse_loss_layer_000": 0.001129, "value_mse_loss_layer_001": 0.003052, "value_mse_loss_layer_002": 0.012024, "value_mse_loss_layer_003": 0.020386, "value_mse_loss_layer_004": 0.018311, "value_mse_loss_layer_005": 0.017578, "value_mse_loss_layer_006": 0.020264, "value_mse_loss_layer_007": 0.02356, "value_mse_loss_layer_008": 0.027222, "value_mse_loss_layer_009": 0.0354, "value_mse_loss_layer_010": 0.0354, "value_mse_loss_layer_011": 0.032959, "value_mse_loss_layer_012": 0.033691, "value_mse_loss_layer_013": 0.034424, "value_mse_loss_layer_014": 0.03833, "value_mse_loss_layer_015": 0.038574, "value_mse_loss_layer_016": 0.0354, "value_mse_loss_layer_017": 0.039551, "value_mse_loss_layer_018": 0.037354, "value_mse_loss_layer_019": 0.042969, "value_mse_loss_layer_020": 0.045166, "value_mse_loss_layer_021": 0.057129, "value_mse_loss_layer_022": 0.055664, "value_mse_loss_layer_023": 0.05957, "value_mse_loss_layer_024": 0.094238, "value_mse_loss_layer_025": 0.092773, "value_mse_loss_layer_026": 0.066895, "value_mse_loss_layer_027": 0.094238, "value_mse_loss_layer_028": 0.088867, "value_mse_loss_layer_029": 0.143555, "value_mse_loss_layer_030": 0.118652, "value_mse_loss_layer_031": 0.157227, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 4.8e-05, "vq_loss_layer_004": 0.000133, "vq_loss_layer_005": 0.000133, "vq_loss_layer_006": 0.000215, "vq_loss_layer_007": 0.000343, "vq_loss_layer_008": 0.000362, "vq_loss_layer_009": 0.000479, "vq_loss_layer_010": 0.000454, "vq_loss_layer_011": 0.000441, "vq_loss_layer_012": 0.000694, "vq_loss_layer_013": 0.00058, "vq_loss_layer_014": 0.000763, "vq_loss_layer_015": 0.000835, "vq_loss_layer_016": 0.000763, "vq_loss_layer_017": 0.000748, "vq_loss_layer_018": 0.000349, "vq_loss_layer_019": 0.000284, "vq_loss_layer_020": 0.000349, "vq_loss_layer_021": 0.000805, "vq_loss_layer_022": 0.000488, "vq_loss_layer_023": 0.000481, "vq_loss_layer_024": 0.000797, "vq_loss_layer_025": 0.000732, "vq_loss_layer_026": 0.000813, "vq_loss_layer_027": 0.001099, "vq_loss_layer_028": 0.001381, "vq_loss_layer_029": 0.002777, "vq_loss_layer_030": 0.003372, "vq_loss_layer_031": 0.010315 }, { "ce_loss": 2.305306, "epoch": 0.00108, "grad_norm": 0.005259781610220671, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.064413, "kv_vq_loss": 0.000982, "learning_rate": 0.0007583559388717374, "loss": 0.065414, "step": 1080, "value_mse_loss_layer_000": 0.001152, "value_mse_loss_layer_001": 0.003067, "value_mse_loss_layer_002": 0.012939, "value_mse_loss_layer_003": 0.019897, "value_mse_loss_layer_004": 0.016968, "value_mse_loss_layer_005": 0.0177, "value_mse_loss_layer_006": 0.020752, "value_mse_loss_layer_007": 0.02356, "value_mse_loss_layer_008": 0.026611, "value_mse_loss_layer_009": 0.036865, "value_mse_loss_layer_010": 0.030884, "value_mse_loss_layer_011": 0.034668, "value_mse_loss_layer_012": 0.037109, "value_mse_loss_layer_013": 0.035156, "value_mse_loss_layer_014": 0.037109, "value_mse_loss_layer_015": 0.038574, "value_mse_loss_layer_016": 0.0354, "value_mse_loss_layer_017": 0.037354, "value_mse_loss_layer_018": 0.037354, "value_mse_loss_layer_019": 0.04248, "value_mse_loss_layer_020": 0.044922, "value_mse_loss_layer_021": 0.055176, "value_mse_loss_layer_022": 0.051025, "value_mse_loss_layer_023": 0.064453, "value_mse_loss_layer_024": 0.066406, "value_mse_loss_layer_025": 0.086426, "value_mse_loss_layer_026": 0.063477, "value_mse_loss_layer_027": 0.08252, "value_mse_loss_layer_028": 0.083008, "value_mse_loss_layer_029": 0.128906, "value_mse_loss_layer_030": 0.10791, "value_mse_loss_layer_031": 0.150391, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 0.0001, "vq_loss_layer_005": 0.000148, "vq_loss_layer_006": 0.000236, "vq_loss_layer_007": 0.000341, "vq_loss_layer_008": 0.000322, "vq_loss_layer_009": 0.000519, "vq_loss_layer_010": 0.000383, "vq_loss_layer_011": 0.000511, "vq_loss_layer_012": 0.000893, "vq_loss_layer_013": 0.000637, "vq_loss_layer_014": 0.000671, "vq_loss_layer_015": 0.000694, "vq_loss_layer_016": 0.000721, "vq_loss_layer_017": 0.000614, "vq_loss_layer_018": 0.000372, "vq_loss_layer_019": 0.000303, "vq_loss_layer_020": 0.000376, "vq_loss_layer_021": 0.000782, "vq_loss_layer_022": 0.000542, "vq_loss_layer_023": 0.000683, "vq_loss_layer_024": 0.000561, "vq_loss_layer_025": 0.000618, "vq_loss_layer_026": 0.000862, "vq_loss_layer_027": 0.000858, "vq_loss_layer_028": 0.001198, "vq_loss_layer_029": 0.002167, "vq_loss_layer_030": 0.002899, "vq_loss_layer_031": 0.009277 }, { "ce_loss": 2.293402, "epoch": 0.00109, "grad_norm": 0.005493406672030687, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.064273, "kv_vq_loss": 0.000988, "learning_rate": 0.0007593566244851558, "loss": 0.065292, "step": 1090, "value_mse_loss_layer_000": 0.001198, "value_mse_loss_layer_001": 0.003128, "value_mse_loss_layer_002": 0.012451, "value_mse_loss_layer_003": 0.019897, "value_mse_loss_layer_004": 0.018311, "value_mse_loss_layer_005": 0.018311, "value_mse_loss_layer_006": 0.02124, "value_mse_loss_layer_007": 0.023926, "value_mse_loss_layer_008": 0.027832, "value_mse_loss_layer_009": 0.034912, "value_mse_loss_layer_010": 0.030518, "value_mse_loss_layer_011": 0.032715, "value_mse_loss_layer_012": 0.033936, "value_mse_loss_layer_013": 0.034668, "value_mse_loss_layer_014": 0.040039, "value_mse_loss_layer_015": 0.038574, "value_mse_loss_layer_016": 0.036133, "value_mse_loss_layer_017": 0.038086, "value_mse_loss_layer_018": 0.037842, "value_mse_loss_layer_019": 0.044189, "value_mse_loss_layer_020": 0.047607, "value_mse_loss_layer_021": 0.059082, "value_mse_loss_layer_022": 0.051025, "value_mse_loss_layer_023": 0.063965, "value_mse_loss_layer_024": 0.068359, "value_mse_loss_layer_025": 0.089355, "value_mse_loss_layer_026": 0.074219, "value_mse_loss_layer_027": 0.092773, "value_mse_loss_layer_028": 0.093262, "value_mse_loss_layer_029": 0.140625, "value_mse_loss_layer_030": 0.122559, "value_mse_loss_layer_031": 0.161133, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 0.000111, "vq_loss_layer_005": 0.000139, "vq_loss_layer_006": 0.000265, "vq_loss_layer_007": 0.000341, "vq_loss_layer_008": 0.000357, "vq_loss_layer_009": 0.000431, "vq_loss_layer_010": 0.000391, "vq_loss_layer_011": 0.000465, "vq_loss_layer_012": 0.000736, "vq_loss_layer_013": 0.000549, "vq_loss_layer_014": 0.00069, "vq_loss_layer_015": 0.000797, "vq_loss_layer_016": 0.000717, "vq_loss_layer_017": 0.000614, "vq_loss_layer_018": 0.000496, "vq_loss_layer_019": 0.000328, "vq_loss_layer_020": 0.000431, "vq_loss_layer_021": 0.000763, "vq_loss_layer_022": 0.000416, "vq_loss_layer_023": 0.000607, "vq_loss_layer_024": 0.000479, "vq_loss_layer_025": 0.00066, "vq_loss_layer_026": 0.001053, "vq_loss_layer_027": 0.001053, "vq_loss_layer_028": 0.001556, "vq_loss_layer_029": 0.002518, "vq_loss_layer_030": 0.003647, "vq_loss_layer_031": 0.010681 }, { "ce_loss": 2.320358, "epoch": 0.0011, "grad_norm": 0.005328066647052765, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.080566, "kv_mse_loss": 0.063885, "kv_vq_loss": 0.000951, "learning_rate": 0.0007603481712895561, "loss": 0.064868, "step": 1100, "value_mse_loss_layer_000": 0.001099, "value_mse_loss_layer_001": 0.003021, "value_mse_loss_layer_002": 0.011902, "value_mse_loss_layer_003": 0.019775, "value_mse_loss_layer_004": 0.017944, "value_mse_loss_layer_005": 0.017456, "value_mse_loss_layer_006": 0.021118, "value_mse_loss_layer_007": 0.022949, "value_mse_loss_layer_008": 0.0271, "value_mse_loss_layer_009": 0.035156, "value_mse_loss_layer_010": 0.033691, "value_mse_loss_layer_011": 0.032959, "value_mse_loss_layer_012": 0.03418, "value_mse_loss_layer_013": 0.0354, "value_mse_loss_layer_014": 0.037109, "value_mse_loss_layer_015": 0.037842, "value_mse_loss_layer_016": 0.035156, "value_mse_loss_layer_017": 0.039795, "value_mse_loss_layer_018": 0.03833, "value_mse_loss_layer_019": 0.051514, "value_mse_loss_layer_020": 0.044678, "value_mse_loss_layer_021": 0.056641, "value_mse_loss_layer_022": 0.051758, "value_mse_loss_layer_023": 0.063477, "value_mse_loss_layer_024": 0.069336, "value_mse_loss_layer_025": 0.085938, "value_mse_loss_layer_026": 0.066406, "value_mse_loss_layer_027": 0.095703, "value_mse_loss_layer_028": 0.088379, "value_mse_loss_layer_029": 0.138672, "value_mse_loss_layer_030": 0.120117, "value_mse_loss_layer_031": 0.150391, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 0.000162, "vq_loss_layer_005": 0.00013, "vq_loss_layer_006": 0.000254, "vq_loss_layer_007": 0.000311, "vq_loss_layer_008": 0.000334, "vq_loss_layer_009": 0.000439, "vq_loss_layer_010": 0.000404, "vq_loss_layer_011": 0.000399, "vq_loss_layer_012": 0.000713, "vq_loss_layer_013": 0.000603, "vq_loss_layer_014": 0.000652, "vq_loss_layer_015": 0.000671, "vq_loss_layer_016": 0.000732, "vq_loss_layer_017": 0.000984, "vq_loss_layer_018": 0.000431, "vq_loss_layer_019": 0.000372, "vq_loss_layer_020": 0.000334, "vq_loss_layer_021": 0.000702, "vq_loss_layer_022": 0.000435, "vq_loss_layer_023": 0.000553, "vq_loss_layer_024": 0.00053, "vq_loss_layer_025": 0.000687, "vq_loss_layer_026": 0.001015, "vq_loss_layer_027": 0.001465, "vq_loss_layer_028": 0.002121, "vq_loss_layer_029": 0.004669, "vq_loss_layer_030": 0.004883, "vq_loss_layer_031": 0.012329 }, { "ce_loss": 2.319897, "epoch": 0.00111, "grad_norm": 0.006790840066969395, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.063937, "kv_vq_loss": 0.000981, "learning_rate": 0.0007613307446966643, "loss": 0.064941, "step": 1110, "value_mse_loss_layer_000": 0.001129, "value_mse_loss_layer_001": 0.003021, "value_mse_loss_layer_002": 0.012329, "value_mse_loss_layer_003": 0.019409, "value_mse_loss_layer_004": 0.018188, "value_mse_loss_layer_005": 0.020142, "value_mse_loss_layer_006": 0.020752, "value_mse_loss_layer_007": 0.02356, "value_mse_loss_layer_008": 0.027222, "value_mse_loss_layer_009": 0.036377, "value_mse_loss_layer_010": 0.030396, "value_mse_loss_layer_011": 0.03418, "value_mse_loss_layer_012": 0.033203, "value_mse_loss_layer_013": 0.034424, "value_mse_loss_layer_014": 0.037354, "value_mse_loss_layer_015": 0.037354, "value_mse_loss_layer_016": 0.038086, "value_mse_loss_layer_017": 0.035645, "value_mse_loss_layer_018": 0.039062, "value_mse_loss_layer_019": 0.042725, "value_mse_loss_layer_020": 0.047607, "value_mse_loss_layer_021": 0.053955, "value_mse_loss_layer_022": 0.049561, "value_mse_loss_layer_023": 0.060059, "value_mse_loss_layer_024": 0.0625, "value_mse_loss_layer_025": 0.090332, "value_mse_loss_layer_026": 0.080078, "value_mse_loss_layer_027": 0.091309, "value_mse_loss_layer_028": 0.087402, "value_mse_loss_layer_029": 0.142578, "value_mse_loss_layer_030": 0.119141, "value_mse_loss_layer_031": 0.164062, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 0.00013, "vq_loss_layer_005": 0.000208, "vq_loss_layer_006": 0.000271, "vq_loss_layer_007": 0.000362, "vq_loss_layer_008": 0.00036, "vq_loss_layer_009": 0.000553, "vq_loss_layer_010": 0.000406, "vq_loss_layer_011": 0.000526, "vq_loss_layer_012": 0.00071, "vq_loss_layer_013": 0.00058, "vq_loss_layer_014": 0.000736, "vq_loss_layer_015": 0.000805, "vq_loss_layer_016": 0.0009, "vq_loss_layer_017": 0.000618, "vq_loss_layer_018": 0.0005, "vq_loss_layer_019": 0.000362, "vq_loss_layer_020": 0.000467, "vq_loss_layer_021": 0.000805, "vq_loss_layer_022": 0.000471, "vq_loss_layer_023": 0.000591, "vq_loss_layer_024": 0.000454, "vq_loss_layer_025": 0.000725, "vq_loss_layer_026": 0.001457, "vq_loss_layer_027": 0.001152, "vq_loss_layer_028": 0.001427, "vq_loss_layer_029": 0.003006, "vq_loss_layer_030": 0.003967, "vq_loss_layer_031": 0.011353 }, { "ce_loss": 2.326883, "epoch": 0.00112, "grad_norm": 0.004401164595037699, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.064114, "kv_vq_loss": 0.000978, "learning_rate": 0.0007623045056675453, "loss": 0.065109, "step": 1120, "value_mse_loss_layer_000": 0.001152, "value_mse_loss_layer_001": 0.003036, "value_mse_loss_layer_002": 0.012268, "value_mse_loss_layer_003": 0.019775, "value_mse_loss_layer_004": 0.016602, "value_mse_loss_layer_005": 0.017334, "value_mse_loss_layer_006": 0.02002, "value_mse_loss_layer_007": 0.023438, "value_mse_loss_layer_008": 0.027954, "value_mse_loss_layer_009": 0.0354, "value_mse_loss_layer_010": 0.030273, "value_mse_loss_layer_011": 0.033691, "value_mse_loss_layer_012": 0.037109, "value_mse_loss_layer_013": 0.034424, "value_mse_loss_layer_014": 0.036865, "value_mse_loss_layer_015": 0.038086, "value_mse_loss_layer_016": 0.0354, "value_mse_loss_layer_017": 0.037842, "value_mse_loss_layer_018": 0.039307, "value_mse_loss_layer_019": 0.04248, "value_mse_loss_layer_020": 0.045898, "value_mse_loss_layer_021": 0.05542, "value_mse_loss_layer_022": 0.051025, "value_mse_loss_layer_023": 0.058838, "value_mse_loss_layer_024": 0.062988, "value_mse_loss_layer_025": 0.07959, "value_mse_loss_layer_026": 0.067383, "value_mse_loss_layer_027": 0.089844, "value_mse_loss_layer_028": 0.085938, "value_mse_loss_layer_029": 0.137695, "value_mse_loss_layer_030": 0.112305, "value_mse_loss_layer_031": 0.145508, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 0.000135, "vq_loss_layer_006": 0.000211, "vq_loss_layer_007": 0.000343, "vq_loss_layer_008": 0.00036, "vq_loss_layer_009": 0.000481, "vq_loss_layer_010": 0.000355, "vq_loss_layer_011": 0.00045, "vq_loss_layer_012": 0.000916, "vq_loss_layer_013": 0.000549, "vq_loss_layer_014": 0.000717, "vq_loss_layer_015": 0.000683, "vq_loss_layer_016": 0.000732, "vq_loss_layer_017": 0.000591, "vq_loss_layer_018": 0.00036, "vq_loss_layer_019": 0.000278, "vq_loss_layer_020": 0.000378, "vq_loss_layer_021": 0.000721, "vq_loss_layer_022": 0.000383, "vq_loss_layer_023": 0.000433, "vq_loss_layer_024": 0.000372, "vq_loss_layer_025": 0.000507, "vq_loss_layer_026": 0.000824, "vq_loss_layer_027": 0.000908, "vq_loss_layer_028": 0.001122, "vq_loss_layer_029": 0.002457, "vq_loss_layer_030": 0.003815, "vq_loss_layer_031": 0.007996 }, { "ce_loss": 2.295978, "epoch": 0.00113, "grad_norm": 0.005882004741579294, "key_mse_loss_layer_000": 0.002777, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.063721, "kv_vq_loss": 0.000951, "learning_rate": 0.0007632696108708548, "loss": 0.064685, "step": 1130, "value_mse_loss_layer_000": 0.001106, "value_mse_loss_layer_001": 0.00296, "value_mse_loss_layer_002": 0.012024, "value_mse_loss_layer_003": 0.019043, "value_mse_loss_layer_004": 0.018066, "value_mse_loss_layer_005": 0.018066, "value_mse_loss_layer_006": 0.020874, "value_mse_loss_layer_007": 0.023682, "value_mse_loss_layer_008": 0.026978, "value_mse_loss_layer_009": 0.035889, "value_mse_loss_layer_010": 0.03418, "value_mse_loss_layer_011": 0.033691, "value_mse_loss_layer_012": 0.033936, "value_mse_loss_layer_013": 0.037598, "value_mse_loss_layer_014": 0.037109, "value_mse_loss_layer_015": 0.038086, "value_mse_loss_layer_016": 0.033691, "value_mse_loss_layer_017": 0.040039, "value_mse_loss_layer_018": 0.036377, "value_mse_loss_layer_019": 0.045166, "value_mse_loss_layer_020": 0.043213, "value_mse_loss_layer_021": 0.056641, "value_mse_loss_layer_022": 0.048828, "value_mse_loss_layer_023": 0.054443, "value_mse_loss_layer_024": 0.060547, "value_mse_loss_layer_025": 0.08252, "value_mse_loss_layer_026": 0.062256, "value_mse_loss_layer_027": 0.083008, "value_mse_loss_layer_028": 0.085449, "value_mse_loss_layer_029": 0.125977, "value_mse_loss_layer_030": 0.11084, "value_mse_loss_layer_031": 0.152344, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 0.000116, "vq_loss_layer_005": 0.000141, "vq_loss_layer_006": 0.000239, "vq_loss_layer_007": 0.00033, "vq_loss_layer_008": 0.000349, "vq_loss_layer_009": 0.000454, "vq_loss_layer_010": 0.000504, "vq_loss_layer_011": 0.000454, "vq_loss_layer_012": 0.000694, "vq_loss_layer_013": 0.000671, "vq_loss_layer_014": 0.000721, "vq_loss_layer_015": 0.00071, "vq_loss_layer_016": 0.000717, "vq_loss_layer_017": 0.000751, "vq_loss_layer_018": 0.000364, "vq_loss_layer_019": 0.000336, "vq_loss_layer_020": 0.000399, "vq_loss_layer_021": 0.000843, "vq_loss_layer_022": 0.000469, "vq_loss_layer_023": 0.000486, "vq_loss_layer_024": 0.000486, "vq_loss_layer_025": 0.000725, "vq_loss_layer_026": 0.000862, "vq_loss_layer_027": 0.001015, "vq_loss_layer_028": 0.00164, "vq_loss_layer_029": 0.002304, "vq_loss_layer_030": 0.003235, "vq_loss_layer_031": 0.010254 }, { "ce_loss": 2.273403, "epoch": 0.00114, "grad_norm": 0.005607365630567074, "key_mse_loss_layer_000": 0.004211, "key_mse_loss_layer_001": 0.011536, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.090332, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.111328, "key_mse_loss_layer_016": 0.105469, "key_mse_loss_layer_017": 0.104004, "key_mse_loss_layer_018": 0.113281, "key_mse_loss_layer_019": 0.096191, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.100586, "key_mse_loss_layer_022": 0.104004, "key_mse_loss_layer_023": 0.104492, "key_mse_loss_layer_024": 0.088379, "key_mse_loss_layer_025": 0.082031, "key_mse_loss_layer_026": 0.09668, "key_mse_loss_layer_027": 0.100098, "key_mse_loss_layer_028": 0.104492, "key_mse_loss_layer_029": 0.100586, "key_mse_loss_layer_030": 0.10498, "key_mse_loss_layer_031": 0.081543, "kv_mse_loss": 0.063809, "kv_vq_loss": 0.00099, "learning_rate": 0.0007642262128341181, "loss": 0.064807, "step": 1140, "value_mse_loss_layer_000": 0.001083, "value_mse_loss_layer_001": 0.003067, "value_mse_loss_layer_002": 0.012024, "value_mse_loss_layer_003": 0.019531, "value_mse_loss_layer_004": 0.018433, "value_mse_loss_layer_005": 0.018921, "value_mse_loss_layer_006": 0.02063, "value_mse_loss_layer_007": 0.02478, "value_mse_loss_layer_008": 0.026978, "value_mse_loss_layer_009": 0.033203, "value_mse_loss_layer_010": 0.030151, "value_mse_loss_layer_011": 0.034668, "value_mse_loss_layer_012": 0.032715, "value_mse_loss_layer_013": 0.033203, "value_mse_loss_layer_014": 0.0354, "value_mse_loss_layer_015": 0.032959, "value_mse_loss_layer_016": 0.034424, "value_mse_loss_layer_017": 0.032959, "value_mse_loss_layer_018": 0.038086, "value_mse_loss_layer_019": 0.043945, "value_mse_loss_layer_020": 0.045166, "value_mse_loss_layer_021": 0.054199, "value_mse_loss_layer_022": 0.048828, "value_mse_loss_layer_023": 0.067871, "value_mse_loss_layer_024": 0.075684, "value_mse_loss_layer_025": 0.087891, "value_mse_loss_layer_026": 0.073242, "value_mse_loss_layer_027": 0.104492, "value_mse_loss_layer_028": 0.092285, "value_mse_loss_layer_029": 0.155273, "value_mse_loss_layer_030": 0.133789, "value_mse_loss_layer_031": 0.172852, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 0.000104, "vq_loss_layer_005": 0.000158, "vq_loss_layer_006": 0.000229, "vq_loss_layer_007": 0.000385, "vq_loss_layer_008": 0.000408, "vq_loss_layer_009": 0.000475, "vq_loss_layer_010": 0.000467, "vq_loss_layer_011": 0.000648, "vq_loss_layer_012": 0.000717, "vq_loss_layer_013": 0.000568, "vq_loss_layer_014": 0.000748, "vq_loss_layer_015": 0.000591, "vq_loss_layer_016": 0.000786, "vq_loss_layer_017": 0.000483, "vq_loss_layer_018": 0.000452, "vq_loss_layer_019": 0.000336, "vq_loss_layer_020": 0.000313, "vq_loss_layer_021": 0.000572, "vq_loss_layer_022": 0.000355, "vq_loss_layer_023": 0.000515, "vq_loss_layer_024": 0.000633, "vq_loss_layer_025": 0.000778, "vq_loss_layer_026": 0.000931, "vq_loss_layer_027": 0.00132, "vq_loss_layer_028": 0.00148, "vq_loss_layer_029": 0.004517, "vq_loss_layer_030": 0.004639, "vq_loss_layer_031": 0.012268 }, { "ce_loss": 2.307446, "epoch": 0.00115, "grad_norm": 0.006246138364076614, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.06348, "kv_vq_loss": 0.000953, "learning_rate": 0.0007651744600884029, "loss": 0.064471, "step": 1150, "value_mse_loss_layer_000": 0.001106, "value_mse_loss_layer_001": 0.002975, "value_mse_loss_layer_002": 0.011963, "value_mse_loss_layer_003": 0.020142, "value_mse_loss_layer_004": 0.017212, "value_mse_loss_layer_005": 0.016968, "value_mse_loss_layer_006": 0.019775, "value_mse_loss_layer_007": 0.022583, "value_mse_loss_layer_008": 0.026978, "value_mse_loss_layer_009": 0.036133, "value_mse_loss_layer_010": 0.029297, "value_mse_loss_layer_011": 0.032715, "value_mse_loss_layer_012": 0.037598, "value_mse_loss_layer_013": 0.033691, "value_mse_loss_layer_014": 0.038574, "value_mse_loss_layer_015": 0.038086, "value_mse_loss_layer_016": 0.03418, "value_mse_loss_layer_017": 0.037109, "value_mse_loss_layer_018": 0.037842, "value_mse_loss_layer_019": 0.042725, "value_mse_loss_layer_020": 0.046143, "value_mse_loss_layer_021": 0.078613, "value_mse_loss_layer_022": 0.05249, "value_mse_loss_layer_023": 0.061035, "value_mse_loss_layer_024": 0.063965, "value_mse_loss_layer_025": 0.081055, "value_mse_loss_layer_026": 0.068359, "value_mse_loss_layer_027": 0.089355, "value_mse_loss_layer_028": 0.083984, "value_mse_loss_layer_029": 0.129883, "value_mse_loss_layer_030": 0.114746, "value_mse_loss_layer_031": 0.151367, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 0.000134, "vq_loss_layer_005": 0.00013, "vq_loss_layer_006": 0.000201, "vq_loss_layer_007": 0.000322, "vq_loss_layer_008": 0.000322, "vq_loss_layer_009": 0.000572, "vq_loss_layer_010": 0.000326, "vq_loss_layer_011": 0.000439, "vq_loss_layer_012": 0.000999, "vq_loss_layer_013": 0.000526, "vq_loss_layer_014": 0.000664, "vq_loss_layer_015": 0.000729, "vq_loss_layer_016": 0.000668, "vq_loss_layer_017": 0.000576, "vq_loss_layer_018": 0.000364, "vq_loss_layer_019": 0.00028, "vq_loss_layer_020": 0.000364, "vq_loss_layer_021": 0.00106, "vq_loss_layer_022": 0.00041, "vq_loss_layer_023": 0.000496, "vq_loss_layer_024": 0.000414, "vq_loss_layer_025": 0.00053, "vq_loss_layer_026": 0.000786, "vq_loss_layer_027": 0.000938, "vq_loss_layer_028": 0.00106, "vq_loss_layer_029": 0.002228, "vq_loss_layer_030": 0.003799, "vq_loss_layer_031": 0.008789 }, { "ce_loss": 2.310362, "epoch": 0.00116, "grad_norm": 0.005680642556399107, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.063837, "kv_vq_loss": 0.000951, "learning_rate": 0.0007661144973067296, "loss": 0.06485, "step": 1160, "value_mse_loss_layer_000": 0.001106, "value_mse_loss_layer_001": 0.003021, "value_mse_loss_layer_002": 0.012451, "value_mse_loss_layer_003": 0.019165, "value_mse_loss_layer_004": 0.017212, "value_mse_loss_layer_005": 0.016968, "value_mse_loss_layer_006": 0.021851, "value_mse_loss_layer_007": 0.022339, "value_mse_loss_layer_008": 0.026123, "value_mse_loss_layer_009": 0.034424, "value_mse_loss_layer_010": 0.03064, "value_mse_loss_layer_011": 0.032715, "value_mse_loss_layer_012": 0.033936, "value_mse_loss_layer_013": 0.035889, "value_mse_loss_layer_014": 0.035645, "value_mse_loss_layer_015": 0.036865, "value_mse_loss_layer_016": 0.034668, "value_mse_loss_layer_017": 0.036865, "value_mse_loss_layer_018": 0.037354, "value_mse_loss_layer_019": 0.04248, "value_mse_loss_layer_020": 0.043701, "value_mse_loss_layer_021": 0.053711, "value_mse_loss_layer_022": 0.049561, "value_mse_loss_layer_023": 0.059326, "value_mse_loss_layer_024": 0.0625, "value_mse_loss_layer_025": 0.080566, "value_mse_loss_layer_026": 0.066406, "value_mse_loss_layer_027": 0.088379, "value_mse_loss_layer_028": 0.091797, "value_mse_loss_layer_029": 0.131836, "value_mse_loss_layer_030": 0.114258, "value_mse_loss_layer_031": 0.148438, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 0.000113, "vq_loss_layer_005": 0.00013, "vq_loss_layer_006": 0.000303, "vq_loss_layer_007": 0.00034, "vq_loss_layer_008": 0.000313, "vq_loss_layer_009": 0.000462, "vq_loss_layer_010": 0.000372, "vq_loss_layer_011": 0.000414, "vq_loss_layer_012": 0.000759, "vq_loss_layer_013": 0.00066, "vq_loss_layer_014": 0.000648, "vq_loss_layer_015": 0.000694, "vq_loss_layer_016": 0.000736, "vq_loss_layer_017": 0.000675, "vq_loss_layer_018": 0.000359, "vq_loss_layer_019": 0.000311, "vq_loss_layer_020": 0.000351, "vq_loss_layer_021": 0.000671, "vq_loss_layer_022": 0.000374, "vq_loss_layer_023": 0.000511, "vq_loss_layer_024": 0.000423, "vq_loss_layer_025": 0.000542, "vq_loss_layer_026": 0.000809, "vq_loss_layer_027": 0.000977, "vq_loss_layer_028": 0.001457, "vq_loss_layer_029": 0.002106, "vq_loss_layer_030": 0.003021, "vq_loss_layer_031": 0.008423 }, { "ce_loss": 2.280994, "epoch": 0.00117, "grad_norm": 0.0062036169692873955, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.062256, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.063254, "kv_vq_loss": 0.000949, "learning_rate": 0.0007670464654365403, "loss": 0.064233, "step": 1170, "value_mse_loss_layer_000": 0.001114, "value_mse_loss_layer_001": 0.003052, "value_mse_loss_layer_002": 0.011902, "value_mse_loss_layer_003": 0.021606, "value_mse_loss_layer_004": 0.016357, "value_mse_loss_layer_005": 0.016357, "value_mse_loss_layer_006": 0.020264, "value_mse_loss_layer_007": 0.022705, "value_mse_loss_layer_008": 0.026001, "value_mse_loss_layer_009": 0.03418, "value_mse_loss_layer_010": 0.029175, "value_mse_loss_layer_011": 0.032471, "value_mse_loss_layer_012": 0.032715, "value_mse_loss_layer_013": 0.036865, "value_mse_loss_layer_014": 0.036865, "value_mse_loss_layer_015": 0.038818, "value_mse_loss_layer_016": 0.033936, "value_mse_loss_layer_017": 0.036865, "value_mse_loss_layer_018": 0.037354, "value_mse_loss_layer_019": 0.042725, "value_mse_loss_layer_020": 0.044922, "value_mse_loss_layer_021": 0.054443, "value_mse_loss_layer_022": 0.050049, "value_mse_loss_layer_023": 0.059814, "value_mse_loss_layer_024": 0.066406, "value_mse_loss_layer_025": 0.084473, "value_mse_loss_layer_026": 0.064453, "value_mse_loss_layer_027": 0.087402, "value_mse_loss_layer_028": 0.085449, "value_mse_loss_layer_029": 0.126953, "value_mse_loss_layer_030": 0.114746, "value_mse_loss_layer_031": 0.15625, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 9.2e-05, "vq_loss_layer_005": 0.000121, "vq_loss_layer_006": 0.000228, "vq_loss_layer_007": 0.00038, "vq_loss_layer_008": 0.000303, "vq_loss_layer_009": 0.000439, "vq_loss_layer_010": 0.00033, "vq_loss_layer_011": 0.000393, "vq_loss_layer_012": 0.000675, "vq_loss_layer_013": 0.000702, "vq_loss_layer_014": 0.000648, "vq_loss_layer_015": 0.000755, "vq_loss_layer_016": 0.000648, "vq_loss_layer_017": 0.000587, "vq_loss_layer_018": 0.000364, "vq_loss_layer_019": 0.000256, "vq_loss_layer_020": 0.000299, "vq_loss_layer_021": 0.00058, "vq_loss_layer_022": 0.000322, "vq_loss_layer_023": 0.000443, "vq_loss_layer_024": 0.000376, "vq_loss_layer_025": 0.000441, "vq_loss_layer_026": 0.000633, "vq_loss_layer_027": 0.000702, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001869, "vq_loss_layer_030": 0.002914, "vq_loss_layer_031": 0.008545 }, { "ce_loss": 2.262923, "epoch": 0.00118, "grad_norm": 0.00557409692555666, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.06431, "kv_vq_loss": 0.000968, "learning_rate": 0.0007679705018265312, "loss": 0.065302, "step": 1180, "value_mse_loss_layer_000": 0.001099, "value_mse_loss_layer_001": 0.002975, "value_mse_loss_layer_002": 0.011719, "value_mse_loss_layer_003": 0.018921, "value_mse_loss_layer_004": 0.017822, "value_mse_loss_layer_005": 0.0177, "value_mse_loss_layer_006": 0.019775, "value_mse_loss_layer_007": 0.022949, "value_mse_loss_layer_008": 0.0271, "value_mse_loss_layer_009": 0.034424, "value_mse_loss_layer_010": 0.029907, "value_mse_loss_layer_011": 0.031982, "value_mse_loss_layer_012": 0.037109, "value_mse_loss_layer_013": 0.03418, "value_mse_loss_layer_014": 0.037354, "value_mse_loss_layer_015": 0.037842, "value_mse_loss_layer_016": 0.035156, "value_mse_loss_layer_017": 0.036133, "value_mse_loss_layer_018": 0.037354, "value_mse_loss_layer_019": 0.044434, "value_mse_loss_layer_020": 0.049072, "value_mse_loss_layer_021": 0.056152, "value_mse_loss_layer_022": 0.050537, "value_mse_loss_layer_023": 0.060059, "value_mse_loss_layer_024": 0.070312, "value_mse_loss_layer_025": 0.080078, "value_mse_loss_layer_026": 0.066406, "value_mse_loss_layer_027": 0.088867, "value_mse_loss_layer_028": 0.087402, "value_mse_loss_layer_029": 0.165039, "value_mse_loss_layer_030": 0.117676, "value_mse_loss_layer_031": 0.148438, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 0.000116, "vq_loss_layer_005": 0.000134, "vq_loss_layer_006": 0.000204, "vq_loss_layer_007": 0.000328, "vq_loss_layer_008": 0.000374, "vq_loss_layer_009": 0.000475, "vq_loss_layer_010": 0.000397, "vq_loss_layer_011": 0.000414, "vq_loss_layer_012": 0.000965, "vq_loss_layer_013": 0.000557, "vq_loss_layer_014": 0.000732, "vq_loss_layer_015": 0.000706, "vq_loss_layer_016": 0.000755, "vq_loss_layer_017": 0.000584, "vq_loss_layer_018": 0.000395, "vq_loss_layer_019": 0.000322, "vq_loss_layer_020": 0.000456, "vq_loss_layer_021": 0.00082, "vq_loss_layer_022": 0.000448, "vq_loss_layer_023": 0.000572, "vq_loss_layer_024": 0.000507, "vq_loss_layer_025": 0.000584, "vq_loss_layer_026": 0.000896, "vq_loss_layer_027": 0.001045, "vq_loss_layer_028": 0.001465, "vq_loss_layer_029": 0.003311, "vq_loss_layer_030": 0.003571, "vq_loss_layer_031": 0.009033 }, { "ce_loss": 2.298951, "epoch": 0.00119, "grad_norm": 0.006850963458418846, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.063681, "kv_vq_loss": 0.000943, "learning_rate": 0.0007688867403481325, "loss": 0.064651, "step": 1190, "value_mse_loss_layer_000": 0.001076, "value_mse_loss_layer_001": 0.00293, "value_mse_loss_layer_002": 0.011963, "value_mse_loss_layer_003": 0.018799, "value_mse_loss_layer_004": 0.01709, "value_mse_loss_layer_005": 0.017334, "value_mse_loss_layer_006": 0.021484, "value_mse_loss_layer_007": 0.023682, "value_mse_loss_layer_008": 0.026245, "value_mse_loss_layer_009": 0.0354, "value_mse_loss_layer_010": 0.032715, "value_mse_loss_layer_011": 0.033936, "value_mse_loss_layer_012": 0.033203, "value_mse_loss_layer_013": 0.034668, "value_mse_loss_layer_014": 0.036621, "value_mse_loss_layer_015": 0.038574, "value_mse_loss_layer_016": 0.036865, "value_mse_loss_layer_017": 0.038574, "value_mse_loss_layer_018": 0.037109, "value_mse_loss_layer_019": 0.043701, "value_mse_loss_layer_020": 0.045654, "value_mse_loss_layer_021": 0.055908, "value_mse_loss_layer_022": 0.05249, "value_mse_loss_layer_023": 0.059326, "value_mse_loss_layer_024": 0.064453, "value_mse_loss_layer_025": 0.084961, "value_mse_loss_layer_026": 0.082031, "value_mse_loss_layer_027": 0.101562, "value_mse_loss_layer_028": 0.086914, "value_mse_loss_layer_029": 0.133789, "value_mse_loss_layer_030": 0.114746, "value_mse_loss_layer_031": 0.152344, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 0.000113, "vq_loss_layer_005": 0.000126, "vq_loss_layer_006": 0.000277, "vq_loss_layer_007": 0.000355, "vq_loss_layer_008": 0.000326, "vq_loss_layer_009": 0.0005, "vq_loss_layer_010": 0.000473, "vq_loss_layer_011": 0.000481, "vq_loss_layer_012": 0.000698, "vq_loss_layer_013": 0.000553, "vq_loss_layer_014": 0.000725, "vq_loss_layer_015": 0.000748, "vq_loss_layer_016": 0.000824, "vq_loss_layer_017": 0.000664, "vq_loss_layer_018": 0.000412, "vq_loss_layer_019": 0.000317, "vq_loss_layer_020": 0.000423, "vq_loss_layer_021": 0.000782, "vq_loss_layer_022": 0.000479, "vq_loss_layer_023": 0.000565, "vq_loss_layer_024": 0.000504, "vq_loss_layer_025": 0.000641, "vq_loss_layer_026": 0.001465, "vq_loss_layer_027": 0.001266, "vq_loss_layer_028": 0.001343, "vq_loss_layer_029": 0.00235, "vq_loss_layer_030": 0.002869, "vq_loss_layer_031": 0.009155 }, { "ce_loss": 2.304697, "epoch": 0.0012, "grad_norm": 0.005697520449757576, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.063144, "kv_vq_loss": 0.000938, "learning_rate": 0.0007697953115119061, "loss": 0.064093, "step": 1200, "value_mse_loss_layer_000": 0.001099, "value_mse_loss_layer_001": 0.003128, "value_mse_loss_layer_002": 0.012024, "value_mse_loss_layer_003": 0.020142, "value_mse_loss_layer_004": 0.017334, "value_mse_loss_layer_005": 0.017456, "value_mse_loss_layer_006": 0.020386, "value_mse_loss_layer_007": 0.022949, "value_mse_loss_layer_008": 0.026855, "value_mse_loss_layer_009": 0.036621, "value_mse_loss_layer_010": 0.032227, "value_mse_loss_layer_011": 0.032227, "value_mse_loss_layer_012": 0.032959, "value_mse_loss_layer_013": 0.033936, "value_mse_loss_layer_014": 0.036133, "value_mse_loss_layer_015": 0.036377, "value_mse_loss_layer_016": 0.032959, "value_mse_loss_layer_017": 0.035645, "value_mse_loss_layer_018": 0.040771, "value_mse_loss_layer_019": 0.042236, "value_mse_loss_layer_020": 0.044678, "value_mse_loss_layer_021": 0.052734, "value_mse_loss_layer_022": 0.04834, "value_mse_loss_layer_023": 0.057861, "value_mse_loss_layer_024": 0.063477, "value_mse_loss_layer_025": 0.089355, "value_mse_loss_layer_026": 0.066406, "value_mse_loss_layer_027": 0.094238, "value_mse_loss_layer_028": 0.087891, "value_mse_loss_layer_029": 0.129883, "value_mse_loss_layer_030": 0.112305, "value_mse_loss_layer_031": 0.148438, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 5e-05, "vq_loss_layer_004": 0.0001, "vq_loss_layer_005": 0.000129, "vq_loss_layer_006": 0.000232, "vq_loss_layer_007": 0.000326, "vq_loss_layer_008": 0.000376, "vq_loss_layer_009": 0.000603, "vq_loss_layer_010": 0.000458, "vq_loss_layer_011": 0.000435, "vq_loss_layer_012": 0.000694, "vq_loss_layer_013": 0.000526, "vq_loss_layer_014": 0.000698, "vq_loss_layer_015": 0.00069, "vq_loss_layer_016": 0.000725, "vq_loss_layer_017": 0.000565, "vq_loss_layer_018": 0.000418, "vq_loss_layer_019": 0.000305, "vq_loss_layer_020": 0.000391, "vq_loss_layer_021": 0.000721, "vq_loss_layer_022": 0.000406, "vq_loss_layer_023": 0.000483, "vq_loss_layer_024": 0.000488, "vq_loss_layer_025": 0.000687, "vq_loss_layer_026": 0.000946, "vq_loss_layer_027": 0.001213, "vq_loss_layer_028": 0.001404, "vq_loss_layer_029": 0.002365, "vq_loss_layer_030": 0.003464, "vq_loss_layer_031": 0.009033 }, { "ce_loss": 2.300383, "epoch": 0.00121, "grad_norm": 0.006147316657006741, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.063239, "kv_vq_loss": 0.000897, "learning_rate": 0.0007706963425791124, "loss": 0.064197, "step": 1210, "value_mse_loss_layer_000": 0.001076, "value_mse_loss_layer_001": 0.00293, "value_mse_loss_layer_002": 0.011719, "value_mse_loss_layer_003": 0.018311, "value_mse_loss_layer_004": 0.016602, "value_mse_loss_layer_005": 0.017334, "value_mse_loss_layer_006": 0.02002, "value_mse_loss_layer_007": 0.022339, "value_mse_loss_layer_008": 0.026123, "value_mse_loss_layer_009": 0.034424, "value_mse_loss_layer_010": 0.030029, "value_mse_loss_layer_011": 0.033203, "value_mse_loss_layer_012": 0.038086, "value_mse_loss_layer_013": 0.035645, "value_mse_loss_layer_014": 0.035645, "value_mse_loss_layer_015": 0.04126, "value_mse_loss_layer_016": 0.032959, "value_mse_loss_layer_017": 0.038818, "value_mse_loss_layer_018": 0.0354, "value_mse_loss_layer_019": 0.041748, "value_mse_loss_layer_020": 0.044189, "value_mse_loss_layer_021": 0.0625, "value_mse_loss_layer_022": 0.049316, "value_mse_loss_layer_023": 0.057373, "value_mse_loss_layer_024": 0.059082, "value_mse_loss_layer_025": 0.086914, "value_mse_loss_layer_026": 0.060791, "value_mse_loss_layer_027": 0.080566, "value_mse_loss_layer_028": 0.081543, "value_mse_loss_layer_029": 0.123535, "value_mse_loss_layer_030": 0.11377, "value_mse_loss_layer_031": 0.141602, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 0.000108, "vq_loss_layer_005": 0.000149, "vq_loss_layer_006": 0.000229, "vq_loss_layer_007": 0.000328, "vq_loss_layer_008": 0.000311, "vq_loss_layer_009": 0.000441, "vq_loss_layer_010": 0.000355, "vq_loss_layer_011": 0.000475, "vq_loss_layer_012": 0.001144, "vq_loss_layer_013": 0.000633, "vq_loss_layer_014": 0.000645, "vq_loss_layer_015": 0.001152, "vq_loss_layer_016": 0.000637, "vq_loss_layer_017": 0.000889, "vq_loss_layer_018": 0.000334, "vq_loss_layer_019": 0.000292, "vq_loss_layer_020": 0.00033, "vq_loss_layer_021": 0.000866, "vq_loss_layer_022": 0.000372, "vq_loss_layer_023": 0.0005, "vq_loss_layer_024": 0.000355, "vq_loss_layer_025": 0.000549, "vq_loss_layer_026": 0.000702, "vq_loss_layer_027": 0.000736, "vq_loss_layer_028": 0.001198, "vq_loss_layer_029": 0.001953, "vq_loss_layer_030": 0.002975, "vq_loss_layer_031": 0.007629 }, { "ce_loss": 2.316685, "epoch": 0.00122, "grad_norm": 0.006998361553996801, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.063126, "kv_vq_loss": 0.000946, "learning_rate": 0.000771589957668687, "loss": 0.064105, "step": 1220, "value_mse_loss_layer_000": 0.001091, "value_mse_loss_layer_001": 0.002975, "value_mse_loss_layer_002": 0.011658, "value_mse_loss_layer_003": 0.018433, "value_mse_loss_layer_004": 0.016357, "value_mse_loss_layer_005": 0.016357, "value_mse_loss_layer_006": 0.02002, "value_mse_loss_layer_007": 0.022339, "value_mse_loss_layer_008": 0.026245, "value_mse_loss_layer_009": 0.033936, "value_mse_loss_layer_010": 0.029419, "value_mse_loss_layer_011": 0.032227, "value_mse_loss_layer_012": 0.035156, "value_mse_loss_layer_013": 0.032959, "value_mse_loss_layer_014": 0.035156, "value_mse_loss_layer_015": 0.037842, "value_mse_loss_layer_016": 0.034424, "value_mse_loss_layer_017": 0.036865, "value_mse_loss_layer_018": 0.038574, "value_mse_loss_layer_019": 0.04248, "value_mse_loss_layer_020": 0.044189, "value_mse_loss_layer_021": 0.053223, "value_mse_loss_layer_022": 0.049805, "value_mse_loss_layer_023": 0.068359, "value_mse_loss_layer_024": 0.075195, "value_mse_loss_layer_025": 0.07959, "value_mse_loss_layer_026": 0.065918, "value_mse_loss_layer_027": 0.080566, "value_mse_loss_layer_028": 0.083008, "value_mse_loss_layer_029": 0.126953, "value_mse_loss_layer_030": 0.107422, "value_mse_loss_layer_031": 0.149414, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 0.000103, "vq_loss_layer_005": 0.000118, "vq_loss_layer_006": 0.00021, "vq_loss_layer_007": 0.000319, "vq_loss_layer_008": 0.000311, "vq_loss_layer_009": 0.000402, "vq_loss_layer_010": 0.000326, "vq_loss_layer_011": 0.000406, "vq_loss_layer_012": 0.000866, "vq_loss_layer_013": 0.000496, "vq_loss_layer_014": 0.000599, "vq_loss_layer_015": 0.000679, "vq_loss_layer_016": 0.000706, "vq_loss_layer_017": 0.000557, "vq_loss_layer_018": 0.000406, "vq_loss_layer_019": 0.000263, "vq_loss_layer_020": 0.00034, "vq_loss_layer_021": 0.00061, "vq_loss_layer_022": 0.000385, "vq_loss_layer_023": 0.000626, "vq_loss_layer_024": 0.000504, "vq_loss_layer_025": 0.000454, "vq_loss_layer_026": 0.00079, "vq_loss_layer_027": 0.000679, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001984, "vq_loss_layer_030": 0.002563, "vq_loss_layer_031": 0.008728 }, { "ce_loss": 2.261043, "epoch": 0.00123, "grad_norm": 0.005453596357256174, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.06358, "kv_vq_loss": 0.00095, "learning_rate": 0.0007724762778598493, "loss": 0.064554, "step": 1230, "value_mse_loss_layer_000": 0.001083, "value_mse_loss_layer_001": 0.002914, "value_mse_loss_layer_002": 0.012024, "value_mse_loss_layer_003": 0.019043, "value_mse_loss_layer_004": 0.017578, "value_mse_loss_layer_005": 0.017212, "value_mse_loss_layer_006": 0.020996, "value_mse_loss_layer_007": 0.023682, "value_mse_loss_layer_008": 0.025757, "value_mse_loss_layer_009": 0.034668, "value_mse_loss_layer_010": 0.03125, "value_mse_loss_layer_011": 0.032227, "value_mse_loss_layer_012": 0.033936, "value_mse_loss_layer_013": 0.03418, "value_mse_loss_layer_014": 0.037109, "value_mse_loss_layer_015": 0.036377, "value_mse_loss_layer_016": 0.038574, "value_mse_loss_layer_017": 0.037842, "value_mse_loss_layer_018": 0.037842, "value_mse_loss_layer_019": 0.044922, "value_mse_loss_layer_020": 0.045166, "value_mse_loss_layer_021": 0.054443, "value_mse_loss_layer_022": 0.048584, "value_mse_loss_layer_023": 0.061279, "value_mse_loss_layer_024": 0.063965, "value_mse_loss_layer_025": 0.087891, "value_mse_loss_layer_026": 0.067383, "value_mse_loss_layer_027": 0.088379, "value_mse_loss_layer_028": 0.084473, "value_mse_loss_layer_029": 0.129883, "value_mse_loss_layer_030": 0.117188, "value_mse_loss_layer_031": 0.152344, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 5.5e-05, "vq_loss_layer_004": 0.000128, "vq_loss_layer_005": 0.000125, "vq_loss_layer_006": 0.000256, "vq_loss_layer_007": 0.000343, "vq_loss_layer_008": 0.000305, "vq_loss_layer_009": 0.000443, "vq_loss_layer_010": 0.000402, "vq_loss_layer_011": 0.000444, "vq_loss_layer_012": 0.000759, "vq_loss_layer_013": 0.000534, "vq_loss_layer_014": 0.000717, "vq_loss_layer_015": 0.000656, "vq_loss_layer_016": 0.000885, "vq_loss_layer_017": 0.00066, "vq_loss_layer_018": 0.000399, "vq_loss_layer_019": 0.000355, "vq_loss_layer_020": 0.000368, "vq_loss_layer_021": 0.000732, "vq_loss_layer_022": 0.00038, "vq_loss_layer_023": 0.000576, "vq_loss_layer_024": 0.000427, "vq_loss_layer_025": 0.000698, "vq_loss_layer_026": 0.000935, "vq_loss_layer_027": 0.000999, "vq_loss_layer_028": 0.001343, "vq_loss_layer_029": 0.002197, "vq_loss_layer_030": 0.003281, "vq_loss_layer_031": 0.010437 }, { "ce_loss": 2.2774, "epoch": 0.00124, "grad_norm": 0.006924309302121401, "key_mse_loss_layer_000": 0.003693, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.041748, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.081543, "key_mse_loss_layer_020": 0.089355, "key_mse_loss_layer_021": 0.083984, "key_mse_loss_layer_022": 0.086426, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.07959, "key_mse_loss_layer_029": 0.074219, "key_mse_loss_layer_030": 0.072266, "key_mse_loss_layer_031": 0.056152, "kv_mse_loss": 0.062833, "kv_vq_loss": 0.000932, "learning_rate": 0.0007733554212905587, "loss": 0.063803, "step": 1240, "value_mse_loss_layer_000": 0.00106, "value_mse_loss_layer_001": 0.002914, "value_mse_loss_layer_002": 0.011536, "value_mse_loss_layer_003": 0.018921, "value_mse_loss_layer_004": 0.017944, "value_mse_loss_layer_005": 0.0177, "value_mse_loss_layer_006": 0.02063, "value_mse_loss_layer_007": 0.023071, "value_mse_loss_layer_008": 0.026611, "value_mse_loss_layer_009": 0.034668, "value_mse_loss_layer_010": 0.029541, "value_mse_loss_layer_011": 0.031982, "value_mse_loss_layer_012": 0.033691, "value_mse_loss_layer_013": 0.034912, "value_mse_loss_layer_014": 0.038086, "value_mse_loss_layer_015": 0.036377, "value_mse_loss_layer_016": 0.032959, "value_mse_loss_layer_017": 0.036865, "value_mse_loss_layer_018": 0.037354, "value_mse_loss_layer_019": 0.042725, "value_mse_loss_layer_020": 0.047119, "value_mse_loss_layer_021": 0.050049, "value_mse_loss_layer_022": 0.044922, "value_mse_loss_layer_023": 0.05835, "value_mse_loss_layer_024": 0.072754, "value_mse_loss_layer_025": 0.076172, "value_mse_loss_layer_026": 0.066895, "value_mse_loss_layer_027": 0.084473, "value_mse_loss_layer_028": 0.084473, "value_mse_loss_layer_029": 0.151367, "value_mse_loss_layer_030": 0.115234, "value_mse_loss_layer_031": 0.15918, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 2.9e-05, "vq_loss_layer_002": 3.6e-05, "vq_loss_layer_003": 6.2e-05, "vq_loss_layer_004": 0.000144, "vq_loss_layer_005": 0.000165, "vq_loss_layer_006": 0.000292, "vq_loss_layer_007": 0.000347, "vq_loss_layer_008": 0.000437, "vq_loss_layer_009": 0.000463, "vq_loss_layer_010": 0.000467, "vq_loss_layer_011": 0.000477, "vq_loss_layer_012": 0.000744, "vq_loss_layer_013": 0.000614, "vq_loss_layer_014": 0.000927, "vq_loss_layer_015": 0.00071, "vq_loss_layer_016": 0.000916, "vq_loss_layer_017": 0.000862, "vq_loss_layer_018": 0.000463, "vq_loss_layer_019": 0.000504, "vq_loss_layer_020": 0.000412, "vq_loss_layer_021": 0.000786, "vq_loss_layer_022": 0.000378, "vq_loss_layer_023": 0.000633, "vq_loss_layer_024": 0.001045, "vq_loss_layer_025": 0.000774, "vq_loss_layer_026": 0.001213, "vq_loss_layer_027": 0.00116, "vq_loss_layer_028": 0.001839, "vq_loss_layer_029": 0.004486, "vq_loss_layer_030": 0.004425, "vq_loss_layer_031": 0.014282 }, { "ce_loss": 2.26511, "epoch": 0.00125, "grad_norm": 0.006263668183237314, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009705, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.043701, "key_mse_loss_layer_004": 0.042236, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.059082, "kv_mse_loss": 0.063394, "kv_vq_loss": 0.000925, "learning_rate": 0.000774227503252014, "loss": 0.064383, "step": 1250, "value_mse_loss_layer_000": 0.001106, "value_mse_loss_layer_001": 0.00293, "value_mse_loss_layer_002": 0.011597, "value_mse_loss_layer_003": 0.018921, "value_mse_loss_layer_004": 0.017456, "value_mse_loss_layer_005": 0.018311, "value_mse_loss_layer_006": 0.019653, "value_mse_loss_layer_007": 0.023682, "value_mse_loss_layer_008": 0.026611, "value_mse_loss_layer_009": 0.035156, "value_mse_loss_layer_010": 0.030884, "value_mse_loss_layer_011": 0.034912, "value_mse_loss_layer_012": 0.036133, "value_mse_loss_layer_013": 0.037354, "value_mse_loss_layer_014": 0.038086, "value_mse_loss_layer_015": 0.041992, "value_mse_loss_layer_016": 0.034912, "value_mse_loss_layer_017": 0.038086, "value_mse_loss_layer_018": 0.036377, "value_mse_loss_layer_019": 0.049805, "value_mse_loss_layer_020": 0.045654, "value_mse_loss_layer_021": 0.063477, "value_mse_loss_layer_022": 0.052734, "value_mse_loss_layer_023": 0.057373, "value_mse_loss_layer_024": 0.059082, "value_mse_loss_layer_025": 0.076172, "value_mse_loss_layer_026": 0.062988, "value_mse_loss_layer_027": 0.083984, "value_mse_loss_layer_028": 0.081055, "value_mse_loss_layer_029": 0.123047, "value_mse_loss_layer_030": 0.112305, "value_mse_loss_layer_031": 0.155273, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 0.000116, "vq_loss_layer_005": 0.000173, "vq_loss_layer_006": 0.000196, "vq_loss_layer_007": 0.000343, "vq_loss_layer_008": 0.000374, "vq_loss_layer_009": 0.000431, "vq_loss_layer_010": 0.000408, "vq_loss_layer_011": 0.000557, "vq_loss_layer_012": 0.000874, "vq_loss_layer_013": 0.000736, "vq_loss_layer_014": 0.000755, "vq_loss_layer_015": 0.001038, "vq_loss_layer_016": 0.000748, "vq_loss_layer_017": 0.000679, "vq_loss_layer_018": 0.000404, "vq_loss_layer_019": 0.000387, "vq_loss_layer_020": 0.000496, "vq_loss_layer_021": 0.001152, "vq_loss_layer_022": 0.000599, "vq_loss_layer_023": 0.000565, "vq_loss_layer_024": 0.0005, "vq_loss_layer_025": 0.000641, "vq_loss_layer_026": 0.000946, "vq_loss_layer_027": 0.001053, "vq_loss_layer_028": 0.001305, "vq_loss_layer_029": 0.002258, "vq_loss_layer_030": 0.003662, "vq_loss_layer_031": 0.009705 }, { "ce_loss": 2.315005, "epoch": 0.00126, "grad_norm": 0.006128319073468447, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.063211, "kv_vq_loss": 0.000969, "learning_rate": 0.0007750926362793906, "loss": 0.064188, "step": 1260, "value_mse_loss_layer_000": 0.001083, "value_mse_loss_layer_001": 0.002945, "value_mse_loss_layer_002": 0.013733, "value_mse_loss_layer_003": 0.019165, "value_mse_loss_layer_004": 0.0177, "value_mse_loss_layer_005": 0.017578, "value_mse_loss_layer_006": 0.019775, "value_mse_loss_layer_007": 0.022949, "value_mse_loss_layer_008": 0.026001, "value_mse_loss_layer_009": 0.035156, "value_mse_loss_layer_010": 0.03125, "value_mse_loss_layer_011": 0.033203, "value_mse_loss_layer_012": 0.035156, "value_mse_loss_layer_013": 0.03418, "value_mse_loss_layer_014": 0.037354, "value_mse_loss_layer_015": 0.037354, "value_mse_loss_layer_016": 0.035156, "value_mse_loss_layer_017": 0.036865, "value_mse_loss_layer_018": 0.034912, "value_mse_loss_layer_019": 0.044189, "value_mse_loss_layer_020": 0.043213, "value_mse_loss_layer_021": 0.05835, "value_mse_loss_layer_022": 0.049316, "value_mse_loss_layer_023": 0.057129, "value_mse_loss_layer_024": 0.062012, "value_mse_loss_layer_025": 0.081543, "value_mse_loss_layer_026": 0.064941, "value_mse_loss_layer_027": 0.086914, "value_mse_loss_layer_028": 0.084961, "value_mse_loss_layer_029": 0.125, "value_mse_loss_layer_030": 0.11377, "value_mse_loss_layer_031": 0.150391, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 2.3e-05, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 0.000135, "vq_loss_layer_005": 0.000141, "vq_loss_layer_006": 0.000222, "vq_loss_layer_007": 0.000359, "vq_loss_layer_008": 0.000349, "vq_loss_layer_009": 0.000467, "vq_loss_layer_010": 0.000412, "vq_loss_layer_011": 0.000496, "vq_loss_layer_012": 0.000839, "vq_loss_layer_013": 0.000572, "vq_loss_layer_014": 0.000786, "vq_loss_layer_015": 0.00087, "vq_loss_layer_016": 0.000786, "vq_loss_layer_017": 0.00071, "vq_loss_layer_018": 0.000355, "vq_loss_layer_019": 0.000362, "vq_loss_layer_020": 0.000391, "vq_loss_layer_021": 0.000931, "vq_loss_layer_022": 0.000488, "vq_loss_layer_023": 0.000576, "vq_loss_layer_024": 0.000576, "vq_loss_layer_025": 0.000725, "vq_loss_layer_026": 0.001022, "vq_loss_layer_027": 0.001053, "vq_loss_layer_028": 0.00148, "vq_loss_layer_029": 0.002411, "vq_loss_layer_030": 0.004181, "vq_loss_layer_031": 0.010315 }, { "ce_loss": 2.275214, "epoch": 0.00127, "grad_norm": 0.005817591678351164, "key_mse_loss_layer_000": 0.005005, "key_mse_loss_layer_001": 0.012878, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.091797, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.079102, "kv_mse_loss": 0.063165, "kv_vq_loss": 0.000927, "learning_rate": 0.0007759509302389891, "loss": 0.064117, "step": 1270, "value_mse_loss_layer_000": 0.001144, "value_mse_loss_layer_001": 0.00293, "value_mse_loss_layer_002": 0.011963, "value_mse_loss_layer_003": 0.019409, "value_mse_loss_layer_004": 0.017944, "value_mse_loss_layer_005": 0.017212, "value_mse_loss_layer_006": 0.021606, "value_mse_loss_layer_007": 0.02356, "value_mse_loss_layer_008": 0.026001, "value_mse_loss_layer_009": 0.034912, "value_mse_loss_layer_010": 0.030762, "value_mse_loss_layer_011": 0.032959, "value_mse_loss_layer_012": 0.034668, "value_mse_loss_layer_013": 0.03418, "value_mse_loss_layer_014": 0.036621, "value_mse_loss_layer_015": 0.036865, "value_mse_loss_layer_016": 0.033691, "value_mse_loss_layer_017": 0.036133, "value_mse_loss_layer_018": 0.039551, "value_mse_loss_layer_019": 0.043457, "value_mse_loss_layer_020": 0.04541, "value_mse_loss_layer_021": 0.052246, "value_mse_loss_layer_022": 0.049561, "value_mse_loss_layer_023": 0.058105, "value_mse_loss_layer_024": 0.066895, "value_mse_loss_layer_025": 0.085449, "value_mse_loss_layer_026": 0.068359, "value_mse_loss_layer_027": 0.083008, "value_mse_loss_layer_028": 0.080566, "value_mse_loss_layer_029": 0.132812, "value_mse_loss_layer_030": 0.12207, "value_mse_loss_layer_031": 0.15918, "vq_loss_layer_000": 1.7e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 0.000125, "vq_loss_layer_005": 0.000132, "vq_loss_layer_006": 0.00032, "vq_loss_layer_007": 0.000372, "vq_loss_layer_008": 0.00034, "vq_loss_layer_009": 0.000488, "vq_loss_layer_010": 0.000389, "vq_loss_layer_011": 0.000446, "vq_loss_layer_012": 0.000774, "vq_loss_layer_013": 0.000561, "vq_loss_layer_014": 0.00071, "vq_loss_layer_015": 0.000721, "vq_loss_layer_016": 0.00069, "vq_loss_layer_017": 0.00061, "vq_loss_layer_018": 0.000458, "vq_loss_layer_019": 0.000385, "vq_loss_layer_020": 0.000401, "vq_loss_layer_021": 0.000778, "vq_loss_layer_022": 0.000444, "vq_loss_layer_023": 0.000538, "vq_loss_layer_024": 0.000591, "vq_loss_layer_025": 0.000862, "vq_loss_layer_026": 0.000999, "vq_loss_layer_027": 0.000954, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.00264, "vq_loss_layer_030": 0.003784, "vq_loss_layer_031": 0.010437 }, { "ce_loss": 2.278389, "epoch": 0.00128, "grad_norm": 0.0054645836353302, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.062814, "kv_vq_loss": 0.00091, "learning_rate": 0.0007768024924119669, "loss": 0.063745, "step": 1280, "value_mse_loss_layer_000": 0.001076, "value_mse_loss_layer_001": 0.002945, "value_mse_loss_layer_002": 0.011475, "value_mse_loss_layer_003": 0.018555, "value_mse_loss_layer_004": 0.016479, "value_mse_loss_layer_005": 0.016968, "value_mse_loss_layer_006": 0.019775, "value_mse_loss_layer_007": 0.022461, "value_mse_loss_layer_008": 0.026001, "value_mse_loss_layer_009": 0.03418, "value_mse_loss_layer_010": 0.029663, "value_mse_loss_layer_011": 0.033447, "value_mse_loss_layer_012": 0.033447, "value_mse_loss_layer_013": 0.033936, "value_mse_loss_layer_014": 0.039795, "value_mse_loss_layer_015": 0.037354, "value_mse_loss_layer_016": 0.035156, "value_mse_loss_layer_017": 0.036621, "value_mse_loss_layer_018": 0.034912, "value_mse_loss_layer_019": 0.041748, "value_mse_loss_layer_020": 0.044922, "value_mse_loss_layer_021": 0.05957, "value_mse_loss_layer_022": 0.048584, "value_mse_loss_layer_023": 0.057617, "value_mse_loss_layer_024": 0.0625, "value_mse_loss_layer_025": 0.086426, "value_mse_loss_layer_026": 0.060059, "value_mse_loss_layer_027": 0.085449, "value_mse_loss_layer_028": 0.080078, "value_mse_loss_layer_029": 0.119629, "value_mse_loss_layer_030": 0.106445, "value_mse_loss_layer_031": 0.139648, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 0.000103, "vq_loss_layer_005": 0.000129, "vq_loss_layer_006": 0.000202, "vq_loss_layer_007": 0.000305, "vq_loss_layer_008": 0.000309, "vq_loss_layer_009": 0.000427, "vq_loss_layer_010": 0.000364, "vq_loss_layer_011": 0.000542, "vq_loss_layer_012": 0.000694, "vq_loss_layer_013": 0.000587, "vq_loss_layer_014": 0.00071, "vq_loss_layer_015": 0.000774, "vq_loss_layer_016": 0.000774, "vq_loss_layer_017": 0.000584, "vq_loss_layer_018": 0.000345, "vq_loss_layer_019": 0.000275, "vq_loss_layer_020": 0.000336, "vq_loss_layer_021": 0.000816, "vq_loss_layer_022": 0.000422, "vq_loss_layer_023": 0.000515, "vq_loss_layer_024": 0.000507, "vq_loss_layer_025": 0.000694, "vq_loss_layer_026": 0.00087, "vq_loss_layer_027": 0.001236, "vq_loss_layer_028": 0.001762, "vq_loss_layer_029": 0.004181, "vq_loss_layer_030": 0.004211, "vq_loss_layer_031": 0.011902 }, { "ce_loss": 2.275357, "epoch": 0.00129, "grad_norm": 0.005068148020654917, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.06282, "kv_vq_loss": 0.000923, "learning_rate": 0.000777647427574812, "loss": 0.063748, "step": 1290, "value_mse_loss_layer_000": 0.001068, "value_mse_loss_layer_001": 0.002899, "value_mse_loss_layer_002": 0.011536, "value_mse_loss_layer_003": 0.019897, "value_mse_loss_layer_004": 0.01709, "value_mse_loss_layer_005": 0.016846, "value_mse_loss_layer_006": 0.019409, "value_mse_loss_layer_007": 0.023438, "value_mse_loss_layer_008": 0.026245, "value_mse_loss_layer_009": 0.033691, "value_mse_loss_layer_010": 0.029907, "value_mse_loss_layer_011": 0.031982, "value_mse_loss_layer_012": 0.034912, "value_mse_loss_layer_013": 0.033203, "value_mse_loss_layer_014": 0.036621, "value_mse_loss_layer_015": 0.036377, "value_mse_loss_layer_016": 0.032959, "value_mse_loss_layer_017": 0.036133, "value_mse_loss_layer_018": 0.036621, "value_mse_loss_layer_019": 0.041992, "value_mse_loss_layer_020": 0.042969, "value_mse_loss_layer_021": 0.056396, "value_mse_loss_layer_022": 0.047119, "value_mse_loss_layer_023": 0.058838, "value_mse_loss_layer_024": 0.063477, "value_mse_loss_layer_025": 0.083496, "value_mse_loss_layer_026": 0.061523, "value_mse_loss_layer_027": 0.081543, "value_mse_loss_layer_028": 0.081543, "value_mse_loss_layer_029": 0.122559, "value_mse_loss_layer_030": 0.108887, "value_mse_loss_layer_031": 0.144531, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 0.000122, "vq_loss_layer_005": 0.000127, "vq_loss_layer_006": 0.000198, "vq_loss_layer_007": 0.000364, "vq_loss_layer_008": 0.000349, "vq_loss_layer_009": 0.000439, "vq_loss_layer_010": 0.000376, "vq_loss_layer_011": 0.00041, "vq_loss_layer_012": 0.000858, "vq_loss_layer_013": 0.000538, "vq_loss_layer_014": 0.000694, "vq_loss_layer_015": 0.000656, "vq_loss_layer_016": 0.000679, "vq_loss_layer_017": 0.000641, "vq_loss_layer_018": 0.000414, "vq_loss_layer_019": 0.000332, "vq_loss_layer_020": 0.000401, "vq_loss_layer_021": 0.000725, "vq_loss_layer_022": 0.00038, "vq_loss_layer_023": 0.000549, "vq_loss_layer_024": 0.000511, "vq_loss_layer_025": 0.000595, "vq_loss_layer_026": 0.000744, "vq_loss_layer_027": 0.000847, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.001854, "vq_loss_layer_030": 0.002655, "vq_loss_layer_031": 0.008484 }, { "ce_loss": 2.326254, "epoch": 0.0013, "grad_norm": 0.006186963990330696, "key_mse_loss_layer_000": 0.003845, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.062778, "kv_vq_loss": 0.000904, "learning_rate": 0.0007784858380767091, "loss": 0.06373, "step": 1300, "value_mse_loss_layer_000": 0.001091, "value_mse_loss_layer_001": 0.002945, "value_mse_loss_layer_002": 0.012939, "value_mse_loss_layer_003": 0.018433, "value_mse_loss_layer_004": 0.015869, "value_mse_loss_layer_005": 0.016113, "value_mse_loss_layer_006": 0.02124, "value_mse_loss_layer_007": 0.021973, "value_mse_loss_layer_008": 0.025879, "value_mse_loss_layer_009": 0.034668, "value_mse_loss_layer_010": 0.029541, "value_mse_loss_layer_011": 0.031738, "value_mse_loss_layer_012": 0.031982, "value_mse_loss_layer_013": 0.033936, "value_mse_loss_layer_014": 0.035645, "value_mse_loss_layer_015": 0.041992, "value_mse_loss_layer_016": 0.037354, "value_mse_loss_layer_017": 0.03833, "value_mse_loss_layer_018": 0.0354, "value_mse_loss_layer_019": 0.040771, "value_mse_loss_layer_020": 0.05249, "value_mse_loss_layer_021": 0.054932, "value_mse_loss_layer_022": 0.049805, "value_mse_loss_layer_023": 0.057129, "value_mse_loss_layer_024": 0.06543, "value_mse_loss_layer_025": 0.075684, "value_mse_loss_layer_026": 0.064453, "value_mse_loss_layer_027": 0.088379, "value_mse_loss_layer_028": 0.078613, "value_mse_loss_layer_029": 0.129883, "value_mse_loss_layer_030": 0.108887, "value_mse_loss_layer_031": 0.140625, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 9.3e-05, "vq_loss_layer_005": 0.000109, "vq_loss_layer_006": 0.000294, "vq_loss_layer_007": 0.000309, "vq_loss_layer_008": 0.000309, "vq_loss_layer_009": 0.000519, "vq_loss_layer_010": 0.000349, "vq_loss_layer_011": 0.00041, "vq_loss_layer_012": 0.000664, "vq_loss_layer_013": 0.000565, "vq_loss_layer_014": 0.000618, "vq_loss_layer_015": 0.001091, "vq_loss_layer_016": 0.000805, "vq_loss_layer_017": 0.000767, "vq_loss_layer_018": 0.000338, "vq_loss_layer_019": 0.000269, "vq_loss_layer_020": 0.000376, "vq_loss_layer_021": 0.000683, "vq_loss_layer_022": 0.000372, "vq_loss_layer_023": 0.000454, "vq_loss_layer_024": 0.000523, "vq_loss_layer_025": 0.000511, "vq_loss_layer_026": 0.00087, "vq_loss_layer_027": 0.001106, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.002304, "vq_loss_layer_030": 0.00293, "vq_loss_layer_031": 0.007812 }, { "ce_loss": 2.290579, "epoch": 0.00131, "grad_norm": 0.00520952045917511, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.041992, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.091797, "key_mse_loss_layer_009": 0.099121, "key_mse_loss_layer_010": 0.111816, "key_mse_loss_layer_011": 0.106934, "key_mse_loss_layer_012": 0.080078, "key_mse_loss_layer_013": 0.146484, "key_mse_loss_layer_014": 0.142578, "key_mse_loss_layer_015": 0.128906, "key_mse_loss_layer_016": 0.125977, "key_mse_loss_layer_017": 0.124023, "key_mse_loss_layer_018": 0.132812, "key_mse_loss_layer_019": 0.103516, "key_mse_loss_layer_020": 0.121094, "key_mse_loss_layer_021": 0.11377, "key_mse_loss_layer_022": 0.120117, "key_mse_loss_layer_023": 0.120117, "key_mse_loss_layer_024": 0.094727, "key_mse_loss_layer_025": 0.086426, "key_mse_loss_layer_026": 0.103516, "key_mse_loss_layer_027": 0.098633, "key_mse_loss_layer_028": 0.107422, "key_mse_loss_layer_029": 0.091797, "key_mse_loss_layer_030": 0.106445, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.062817, "kv_vq_loss": 0.00093, "learning_rate": 0.0007793178239139409, "loss": 0.06376, "step": 1310, "value_mse_loss_layer_000": 0.001045, "value_mse_loss_layer_001": 0.002838, "value_mse_loss_layer_002": 0.011841, "value_mse_loss_layer_003": 0.018311, "value_mse_loss_layer_004": 0.017334, "value_mse_loss_layer_005": 0.017212, "value_mse_loss_layer_006": 0.019897, "value_mse_loss_layer_007": 0.022827, "value_mse_loss_layer_008": 0.02417, "value_mse_loss_layer_009": 0.03125, "value_mse_loss_layer_010": 0.02771, "value_mse_loss_layer_011": 0.030884, "value_mse_loss_layer_012": 0.032227, "value_mse_loss_layer_013": 0.031006, "value_mse_loss_layer_014": 0.035156, "value_mse_loss_layer_015": 0.032227, "value_mse_loss_layer_016": 0.029663, "value_mse_loss_layer_017": 0.033691, "value_mse_loss_layer_018": 0.033447, "value_mse_loss_layer_019": 0.038086, "value_mse_loss_layer_020": 0.042236, "value_mse_loss_layer_021": 0.047363, "value_mse_loss_layer_022": 0.04126, "value_mse_loss_layer_023": 0.055176, "value_mse_loss_layer_024": 0.054443, "value_mse_loss_layer_025": 0.069824, "value_mse_loss_layer_026": 0.061523, "value_mse_loss_layer_027": 0.074219, "value_mse_loss_layer_028": 0.071289, "value_mse_loss_layer_029": 0.102051, "value_mse_loss_layer_030": 0.098633, "value_mse_loss_layer_031": 0.144531, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 2.9e-05, "vq_loss_layer_002": 3.9e-05, "vq_loss_layer_003": 6.9e-05, "vq_loss_layer_004": 0.000132, "vq_loss_layer_005": 0.000152, "vq_loss_layer_006": 0.00028, "vq_loss_layer_007": 0.000357, "vq_loss_layer_008": 0.000368, "vq_loss_layer_009": 0.000414, "vq_loss_layer_010": 0.000418, "vq_loss_layer_011": 0.000504, "vq_loss_layer_012": 0.000919, "vq_loss_layer_013": 0.000511, "vq_loss_layer_014": 0.000927, "vq_loss_layer_015": 0.000648, "vq_loss_layer_016": 0.000679, "vq_loss_layer_017": 0.000607, "vq_loss_layer_018": 0.000418, "vq_loss_layer_019": 0.000324, "vq_loss_layer_020": 0.000481, "vq_loss_layer_021": 0.000824, "vq_loss_layer_022": 0.000437, "vq_loss_layer_023": 0.000912, "vq_loss_layer_024": 0.000603, "vq_loss_layer_025": 0.001015, "vq_loss_layer_026": 0.001434, "vq_loss_layer_027": 0.001205, "vq_loss_layer_028": 0.002121, "vq_loss_layer_029": 0.00235, "vq_loss_layer_030": 0.004181, "vq_loss_layer_031": 0.011841 }, { "ce_loss": 2.305072, "epoch": 0.00132, "grad_norm": 0.007045458070933819, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.062817, "kv_vq_loss": 0.000926, "learning_rate": 0.0007801434828014624, "loss": 0.063763, "step": 1320, "value_mse_loss_layer_000": 0.001038, "value_mse_loss_layer_001": 0.002869, "value_mse_loss_layer_002": 0.011597, "value_mse_loss_layer_003": 0.022095, "value_mse_loss_layer_004": 0.017822, "value_mse_loss_layer_005": 0.017456, "value_mse_loss_layer_006": 0.019165, "value_mse_loss_layer_007": 0.021851, "value_mse_loss_layer_008": 0.025024, "value_mse_loss_layer_009": 0.031738, "value_mse_loss_layer_010": 0.028809, "value_mse_loss_layer_011": 0.030029, "value_mse_loss_layer_012": 0.030884, "value_mse_loss_layer_013": 0.032227, "value_mse_loss_layer_014": 0.0354, "value_mse_loss_layer_015": 0.03418, "value_mse_loss_layer_016": 0.03125, "value_mse_loss_layer_017": 0.034668, "value_mse_loss_layer_018": 0.039795, "value_mse_loss_layer_019": 0.040771, "value_mse_loss_layer_020": 0.041748, "value_mse_loss_layer_021": 0.053467, "value_mse_loss_layer_022": 0.046143, "value_mse_loss_layer_023": 0.056396, "value_mse_loss_layer_024": 0.073242, "value_mse_loss_layer_025": 0.089844, "value_mse_loss_layer_026": 0.065918, "value_mse_loss_layer_027": 0.086914, "value_mse_loss_layer_028": 0.092773, "value_mse_loss_layer_029": 0.145508, "value_mse_loss_layer_030": 0.121582, "value_mse_loss_layer_031": 0.15625, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 2.2e-05, "vq_loss_layer_002": 2.3e-05, "vq_loss_layer_003": 7.4e-05, "vq_loss_layer_004": 0.000124, "vq_loss_layer_005": 0.000138, "vq_loss_layer_006": 0.000196, "vq_loss_layer_007": 0.000303, "vq_loss_layer_008": 0.000349, "vq_loss_layer_009": 0.000401, "vq_loss_layer_010": 0.000446, "vq_loss_layer_011": 0.000395, "vq_loss_layer_012": 0.000664, "vq_loss_layer_013": 0.000626, "vq_loss_layer_014": 0.00069, "vq_loss_layer_015": 0.000626, "vq_loss_layer_016": 0.000664, "vq_loss_layer_017": 0.000584, "vq_loss_layer_018": 0.000479, "vq_loss_layer_019": 0.000322, "vq_loss_layer_020": 0.000305, "vq_loss_layer_021": 0.000721, "vq_loss_layer_022": 0.000366, "vq_loss_layer_023": 0.000486, "vq_loss_layer_024": 0.000717, "vq_loss_layer_025": 0.000881, "vq_loss_layer_026": 0.001007, "vq_loss_layer_027": 0.001266, "vq_loss_layer_028": 0.002014, "vq_loss_layer_029": 0.003265, "vq_loss_layer_030": 0.004578, "vq_loss_layer_031": 0.01178 }, { "ce_loss": 2.287015, "epoch": 0.00133, "grad_norm": 0.005170387681573629, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.063455, "kv_vq_loss": 0.000938, "learning_rate": 0.0007809629102417713, "loss": 0.064417, "step": 1330, "value_mse_loss_layer_000": 0.001038, "value_mse_loss_layer_001": 0.002838, "value_mse_loss_layer_002": 0.011353, "value_mse_loss_layer_003": 0.020264, "value_mse_loss_layer_004": 0.01709, "value_mse_loss_layer_005": 0.016235, "value_mse_loss_layer_006": 0.020508, "value_mse_loss_layer_007": 0.022339, "value_mse_loss_layer_008": 0.025757, "value_mse_loss_layer_009": 0.034668, "value_mse_loss_layer_010": 0.028809, "value_mse_loss_layer_011": 0.031738, "value_mse_loss_layer_012": 0.031494, "value_mse_loss_layer_013": 0.032959, "value_mse_loss_layer_014": 0.0354, "value_mse_loss_layer_015": 0.036377, "value_mse_loss_layer_016": 0.032227, "value_mse_loss_layer_017": 0.035645, "value_mse_loss_layer_018": 0.036621, "value_mse_loss_layer_019": 0.04248, "value_mse_loss_layer_020": 0.047119, "value_mse_loss_layer_021": 0.054199, "value_mse_loss_layer_022": 0.048828, "value_mse_loss_layer_023": 0.062988, "value_mse_loss_layer_024": 0.070312, "value_mse_loss_layer_025": 0.080078, "value_mse_loss_layer_026": 0.073242, "value_mse_loss_layer_027": 0.095703, "value_mse_loss_layer_028": 0.088379, "value_mse_loss_layer_029": 0.129883, "value_mse_loss_layer_030": 0.117188, "value_mse_loss_layer_031": 0.149414, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 0.000114, "vq_loss_layer_005": 0.000108, "vq_loss_layer_006": 0.000277, "vq_loss_layer_007": 0.000334, "vq_loss_layer_008": 0.000317, "vq_loss_layer_009": 0.000492, "vq_loss_layer_010": 0.000345, "vq_loss_layer_011": 0.000429, "vq_loss_layer_012": 0.000622, "vq_loss_layer_013": 0.000519, "vq_loss_layer_014": 0.000618, "vq_loss_layer_015": 0.00066, "vq_loss_layer_016": 0.00066, "vq_loss_layer_017": 0.000523, "vq_loss_layer_018": 0.000372, "vq_loss_layer_019": 0.000311, "vq_loss_layer_020": 0.000317, "vq_loss_layer_021": 0.000637, "vq_loss_layer_022": 0.000372, "vq_loss_layer_023": 0.000538, "vq_loss_layer_024": 0.000486, "vq_loss_layer_025": 0.000471, "vq_loss_layer_026": 0.000961, "vq_loss_layer_027": 0.001015, "vq_loss_layer_028": 0.001305, "vq_loss_layer_029": 0.002182, "vq_loss_layer_030": 0.002899, "vq_loss_layer_031": 0.008789 }, { "ce_loss": 2.256926, "epoch": 0.00134, "grad_norm": 0.005553320050239563, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.076172, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.062817, "kv_vq_loss": 0.000923, "learning_rate": 0.0007817761995912018, "loss": 0.063736, "step": 1340, "value_mse_loss_layer_000": 0.001083, "value_mse_loss_layer_001": 0.00293, "value_mse_loss_layer_002": 0.013184, "value_mse_loss_layer_003": 0.018311, "value_mse_loss_layer_004": 0.017212, "value_mse_loss_layer_005": 0.016968, "value_mse_loss_layer_006": 0.020142, "value_mse_loss_layer_007": 0.022461, "value_mse_loss_layer_008": 0.026001, "value_mse_loss_layer_009": 0.034424, "value_mse_loss_layer_010": 0.029053, "value_mse_loss_layer_011": 0.031494, "value_mse_loss_layer_012": 0.032715, "value_mse_loss_layer_013": 0.032959, "value_mse_loss_layer_014": 0.036133, "value_mse_loss_layer_015": 0.037842, "value_mse_loss_layer_016": 0.032715, "value_mse_loss_layer_017": 0.036865, "value_mse_loss_layer_018": 0.035645, "value_mse_loss_layer_019": 0.044922, "value_mse_loss_layer_020": 0.043213, "value_mse_loss_layer_021": 0.050293, "value_mse_loss_layer_022": 0.051514, "value_mse_loss_layer_023": 0.05542, "value_mse_loss_layer_024": 0.058105, "value_mse_loss_layer_025": 0.080566, "value_mse_loss_layer_026": 0.061768, "value_mse_loss_layer_027": 0.084473, "value_mse_loss_layer_028": 0.081543, "value_mse_loss_layer_029": 0.114746, "value_mse_loss_layer_030": 0.10498, "value_mse_loss_layer_031": 0.145508, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 2.2e-05, "vq_loss_layer_002": 2.8e-05, "vq_loss_layer_003": 4.8e-05, "vq_loss_layer_004": 0.000103, "vq_loss_layer_005": 0.000128, "vq_loss_layer_006": 0.000241, "vq_loss_layer_007": 0.000309, "vq_loss_layer_008": 0.000357, "vq_loss_layer_009": 0.000463, "vq_loss_layer_010": 0.000399, "vq_loss_layer_011": 0.000467, "vq_loss_layer_012": 0.000721, "vq_loss_layer_013": 0.000526, "vq_loss_layer_014": 0.000782, "vq_loss_layer_015": 0.000771, "vq_loss_layer_016": 0.000748, "vq_loss_layer_017": 0.000809, "vq_loss_layer_018": 0.000389, "vq_loss_layer_019": 0.000359, "vq_loss_layer_020": 0.000347, "vq_loss_layer_021": 0.000805, "vq_loss_layer_022": 0.000549, "vq_loss_layer_023": 0.000584, "vq_loss_layer_024": 0.000504, "vq_loss_layer_025": 0.000748, "vq_loss_layer_026": 0.001007, "vq_loss_layer_027": 0.001259, "vq_loss_layer_028": 0.001549, "vq_loss_layer_029": 0.002411, "vq_loss_layer_030": 0.003601, "vq_loss_layer_031": 0.010681 }, { "ce_loss": 2.319294, "epoch": 0.00135, "grad_norm": 0.00575883686542511, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.062527, "kv_vq_loss": 0.00092, "learning_rate": 0.0007825834421237515, "loss": 0.063467, "step": 1350, "value_mse_loss_layer_000": 0.00106, "value_mse_loss_layer_001": 0.002869, "value_mse_loss_layer_002": 0.011353, "value_mse_loss_layer_003": 0.0177, "value_mse_loss_layer_004": 0.016846, "value_mse_loss_layer_005": 0.017944, "value_mse_loss_layer_006": 0.019287, "value_mse_loss_layer_007": 0.022095, "value_mse_loss_layer_008": 0.025513, "value_mse_loss_layer_009": 0.033447, "value_mse_loss_layer_010": 0.029175, "value_mse_loss_layer_011": 0.03125, "value_mse_loss_layer_012": 0.033691, "value_mse_loss_layer_013": 0.032471, "value_mse_loss_layer_014": 0.034668, "value_mse_loss_layer_015": 0.036621, "value_mse_loss_layer_016": 0.032959, "value_mse_loss_layer_017": 0.035889, "value_mse_loss_layer_018": 0.036133, "value_mse_loss_layer_019": 0.040283, "value_mse_loss_layer_020": 0.044922, "value_mse_loss_layer_021": 0.056396, "value_mse_loss_layer_022": 0.049072, "value_mse_loss_layer_023": 0.062988, "value_mse_loss_layer_024": 0.063477, "value_mse_loss_layer_025": 0.078613, "value_mse_loss_layer_026": 0.062256, "value_mse_loss_layer_027": 0.086914, "value_mse_loss_layer_028": 0.080566, "value_mse_loss_layer_029": 0.120117, "value_mse_loss_layer_030": 0.116699, "value_mse_loss_layer_031": 0.146484, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 0.000127, "vq_loss_layer_005": 0.000162, "vq_loss_layer_006": 0.0002, "vq_loss_layer_007": 0.00033, "vq_loss_layer_008": 0.000298, "vq_loss_layer_009": 0.000397, "vq_loss_layer_010": 0.000345, "vq_loss_layer_011": 0.000399, "vq_loss_layer_012": 0.000801, "vq_loss_layer_013": 0.000515, "vq_loss_layer_014": 0.00061, "vq_loss_layer_015": 0.000698, "vq_loss_layer_016": 0.000679, "vq_loss_layer_017": 0.000572, "vq_loss_layer_018": 0.000345, "vq_loss_layer_019": 0.000254, "vq_loss_layer_020": 0.000322, "vq_loss_layer_021": 0.000725, "vq_loss_layer_022": 0.000357, "vq_loss_layer_023": 0.000561, "vq_loss_layer_024": 0.000414, "vq_loss_layer_025": 0.000483, "vq_loss_layer_026": 0.000702, "vq_loss_layer_027": 0.000767, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.001915, "vq_loss_layer_030": 0.002747, "vq_loss_layer_031": 0.008423 }, { "ce_loss": 2.285877, "epoch": 0.00136, "grad_norm": 0.00967082567512989, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.124512, "key_mse_loss_layer_015": 0.11084, "key_mse_loss_layer_016": 0.103027, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.109863, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.102539, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.062839, "kv_vq_loss": 0.000905, "learning_rate": 0.0007833847270925543, "loss": 0.063776, "step": 1360, "value_mse_loss_layer_000": 0.001114, "value_mse_loss_layer_001": 0.00293, "value_mse_loss_layer_002": 0.01239, "value_mse_loss_layer_003": 0.018677, "value_mse_loss_layer_004": 0.017334, "value_mse_loss_layer_005": 0.016968, "value_mse_loss_layer_006": 0.019775, "value_mse_loss_layer_007": 0.022217, "value_mse_loss_layer_008": 0.026367, "value_mse_loss_layer_009": 0.033447, "value_mse_loss_layer_010": 0.029053, "value_mse_loss_layer_011": 0.031006, "value_mse_loss_layer_012": 0.031494, "value_mse_loss_layer_013": 0.032471, "value_mse_loss_layer_014": 0.033936, "value_mse_loss_layer_015": 0.034668, "value_mse_loss_layer_016": 0.036621, "value_mse_loss_layer_017": 0.035156, "value_mse_loss_layer_018": 0.032715, "value_mse_loss_layer_019": 0.040039, "value_mse_loss_layer_020": 0.042236, "value_mse_loss_layer_021": 0.048828, "value_mse_loss_layer_022": 0.044922, "value_mse_loss_layer_023": 0.051514, "value_mse_loss_layer_024": 0.058105, "value_mse_loss_layer_025": 0.077148, "value_mse_loss_layer_026": 0.058838, "value_mse_loss_layer_027": 0.080078, "value_mse_loss_layer_028": 0.074219, "value_mse_loss_layer_029": 0.183594, "value_mse_loss_layer_030": 0.108887, "value_mse_loss_layer_031": 0.144531, "vq_loss_layer_000": 1.6e-05, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.5e-05, "vq_loss_layer_003": 5.2e-05, "vq_loss_layer_004": 0.000134, "vq_loss_layer_005": 0.000138, "vq_loss_layer_006": 0.000246, "vq_loss_layer_007": 0.000317, "vq_loss_layer_008": 0.000452, "vq_loss_layer_009": 0.00045, "vq_loss_layer_010": 0.000418, "vq_loss_layer_011": 0.000454, "vq_loss_layer_012": 0.000698, "vq_loss_layer_013": 0.000496, "vq_loss_layer_014": 0.000748, "vq_loss_layer_015": 0.000648, "vq_loss_layer_016": 0.000984, "vq_loss_layer_017": 0.000607, "vq_loss_layer_018": 0.000315, "vq_loss_layer_019": 0.000395, "vq_loss_layer_020": 0.000355, "vq_loss_layer_021": 0.000755, "vq_loss_layer_022": 0.000401, "vq_loss_layer_023": 0.000526, "vq_loss_layer_024": 0.000507, "vq_loss_layer_025": 0.000813, "vq_loss_layer_026": 0.000927, "vq_loss_layer_027": 0.001015, "vq_loss_layer_028": 0.001373, "vq_loss_layer_029": 0.004669, "vq_loss_layer_030": 0.004089, "vq_loss_layer_031": 0.010986 }, { "ce_loss": 2.21534, "epoch": 0.00137, "grad_norm": 0.005496214609593153, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.063138, "kv_vq_loss": 0.000951, "learning_rate": 0.0007841801417891016, "loss": 0.064111, "step": 1370, "value_mse_loss_layer_000": 0.001022, "value_mse_loss_layer_001": 0.002884, "value_mse_loss_layer_002": 0.011353, "value_mse_loss_layer_003": 0.018066, "value_mse_loss_layer_004": 0.016235, "value_mse_loss_layer_005": 0.015991, "value_mse_loss_layer_006": 0.019165, "value_mse_loss_layer_007": 0.021606, "value_mse_loss_layer_008": 0.025269, "value_mse_loss_layer_009": 0.033691, "value_mse_loss_layer_010": 0.028198, "value_mse_loss_layer_011": 0.030396, "value_mse_loss_layer_012": 0.031128, "value_mse_loss_layer_013": 0.031738, "value_mse_loss_layer_014": 0.035645, "value_mse_loss_layer_015": 0.0354, "value_mse_loss_layer_016": 0.031982, "value_mse_loss_layer_017": 0.034912, "value_mse_loss_layer_018": 0.036377, "value_mse_loss_layer_019": 0.039795, "value_mse_loss_layer_020": 0.044189, "value_mse_loss_layer_021": 0.048828, "value_mse_loss_layer_022": 0.048584, "value_mse_loss_layer_023": 0.056152, "value_mse_loss_layer_024": 0.063477, "value_mse_loss_layer_025": 0.073242, "value_mse_loss_layer_026": 0.063965, "value_mse_loss_layer_027": 0.086914, "value_mse_loss_layer_028": 0.079102, "value_mse_loss_layer_029": 0.126953, "value_mse_loss_layer_030": 0.109375, "value_mse_loss_layer_031": 0.155273, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 0.000122, "vq_loss_layer_005": 0.000117, "vq_loss_layer_006": 0.000194, "vq_loss_layer_007": 0.000309, "vq_loss_layer_008": 0.000311, "vq_loss_layer_009": 0.000446, "vq_loss_layer_010": 0.000322, "vq_loss_layer_011": 0.000389, "vq_loss_layer_012": 0.000652, "vq_loss_layer_013": 0.000504, "vq_loss_layer_014": 0.000629, "vq_loss_layer_015": 0.000633, "vq_loss_layer_016": 0.000683, "vq_loss_layer_017": 0.000595, "vq_loss_layer_018": 0.000366, "vq_loss_layer_019": 0.000252, "vq_loss_layer_020": 0.000391, "vq_loss_layer_021": 0.000519, "vq_loss_layer_022": 0.000366, "vq_loss_layer_023": 0.00046, "vq_loss_layer_024": 0.000479, "vq_loss_layer_025": 0.000439, "vq_loss_layer_026": 0.000725, "vq_loss_layer_027": 0.000828, "vq_loss_layer_028": 0.000935, "vq_loss_layer_029": 0.002075, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.009277 }, { "ce_loss": 2.278827, "epoch": 0.00138, "grad_norm": 0.005460513290017843, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.063284, "kv_vq_loss": 0.000927, "learning_rate": 0.000784969771600309, "loss": 0.06423, "step": 1380, "value_mse_loss_layer_000": 0.001053, "value_mse_loss_layer_001": 0.002884, "value_mse_loss_layer_002": 0.012024, "value_mse_loss_layer_003": 0.018555, "value_mse_loss_layer_004": 0.015747, "value_mse_loss_layer_005": 0.016724, "value_mse_loss_layer_006": 0.022583, "value_mse_loss_layer_007": 0.021484, "value_mse_loss_layer_008": 0.025635, "value_mse_loss_layer_009": 0.033203, "value_mse_loss_layer_010": 0.028564, "value_mse_loss_layer_011": 0.032227, "value_mse_loss_layer_012": 0.032227, "value_mse_loss_layer_013": 0.033447, "value_mse_loss_layer_014": 0.034424, "value_mse_loss_layer_015": 0.035645, "value_mse_loss_layer_016": 0.031982, "value_mse_loss_layer_017": 0.036377, "value_mse_loss_layer_018": 0.0354, "value_mse_loss_layer_019": 0.039795, "value_mse_loss_layer_020": 0.04126, "value_mse_loss_layer_021": 0.058594, "value_mse_loss_layer_022": 0.049316, "value_mse_loss_layer_023": 0.061035, "value_mse_loss_layer_024": 0.066895, "value_mse_loss_layer_025": 0.075684, "value_mse_loss_layer_026": 0.060303, "value_mse_loss_layer_027": 0.089844, "value_mse_loss_layer_028": 0.080078, "value_mse_loss_layer_029": 0.128906, "value_mse_loss_layer_030": 0.105469, "value_mse_loss_layer_031": 0.138672, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 8.8e-05, "vq_loss_layer_005": 0.000122, "vq_loss_layer_006": 0.000368, "vq_loss_layer_007": 0.000299, "vq_loss_layer_008": 0.00033, "vq_loss_layer_009": 0.000404, "vq_loss_layer_010": 0.00032, "vq_loss_layer_011": 0.000456, "vq_loss_layer_012": 0.000694, "vq_loss_layer_013": 0.000572, "vq_loss_layer_014": 0.000629, "vq_loss_layer_015": 0.00061, "vq_loss_layer_016": 0.000629, "vq_loss_layer_017": 0.000595, "vq_loss_layer_018": 0.00033, "vq_loss_layer_019": 0.000241, "vq_loss_layer_020": 0.000309, "vq_loss_layer_021": 0.000729, "vq_loss_layer_022": 0.000402, "vq_loss_layer_023": 0.000519, "vq_loss_layer_024": 0.000462, "vq_loss_layer_025": 0.000471, "vq_loss_layer_026": 0.000668, "vq_loss_layer_027": 0.000954, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.00209, "vq_loss_layer_030": 0.002975, "vq_loss_layer_031": 0.007874 }, { "ce_loss": 2.270974, "epoch": 0.00139, "grad_norm": 0.005115075968205929, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.062024, "kv_vq_loss": 0.000886, "learning_rate": 0.0007857537000635237, "loss": 0.062918, "step": 1390, "value_mse_loss_layer_000": 0.001022, "value_mse_loss_layer_001": 0.002823, "value_mse_loss_layer_002": 0.011719, "value_mse_loss_layer_003": 0.018921, "value_mse_loss_layer_004": 0.016602, "value_mse_loss_layer_005": 0.016968, "value_mse_loss_layer_006": 0.019531, "value_mse_loss_layer_007": 0.022095, "value_mse_loss_layer_008": 0.025635, "value_mse_loss_layer_009": 0.033203, "value_mse_loss_layer_010": 0.029663, "value_mse_loss_layer_011": 0.031494, "value_mse_loss_layer_012": 0.032715, "value_mse_loss_layer_013": 0.032959, "value_mse_loss_layer_014": 0.035645, "value_mse_loss_layer_015": 0.036621, "value_mse_loss_layer_016": 0.036865, "value_mse_loss_layer_017": 0.03418, "value_mse_loss_layer_018": 0.034668, "value_mse_loss_layer_019": 0.040283, "value_mse_loss_layer_020": 0.044678, "value_mse_loss_layer_021": 0.051758, "value_mse_loss_layer_022": 0.047852, "value_mse_loss_layer_023": 0.053467, "value_mse_loss_layer_024": 0.058838, "value_mse_loss_layer_025": 0.072266, "value_mse_loss_layer_026": 0.061279, "value_mse_loss_layer_027": 0.081055, "value_mse_loss_layer_028": 0.077637, "value_mse_loss_layer_029": 0.121582, "value_mse_loss_layer_030": 0.109863, "value_mse_loss_layer_031": 0.148438, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 5.2e-05, "vq_loss_layer_004": 0.00011, "vq_loss_layer_005": 0.000131, "vq_loss_layer_006": 0.000218, "vq_loss_layer_007": 0.000317, "vq_loss_layer_008": 0.00036, "vq_loss_layer_009": 0.000422, "vq_loss_layer_010": 0.000437, "vq_loss_layer_011": 0.000435, "vq_loss_layer_012": 0.000706, "vq_loss_layer_013": 0.000572, "vq_loss_layer_014": 0.000729, "vq_loss_layer_015": 0.000778, "vq_loss_layer_016": 0.000965, "vq_loss_layer_017": 0.000584, "vq_loss_layer_018": 0.000368, "vq_loss_layer_019": 0.000343, "vq_loss_layer_020": 0.000418, "vq_loss_layer_021": 0.000828, "vq_loss_layer_022": 0.000488, "vq_loss_layer_023": 0.000507, "vq_loss_layer_024": 0.000511, "vq_loss_layer_025": 0.000584, "vq_loss_layer_026": 0.0009, "vq_loss_layer_027": 0.000942, "vq_loss_layer_028": 0.001312, "vq_loss_layer_029": 0.002594, "vq_loss_layer_030": 0.003555, "vq_loss_layer_031": 0.009827 }, { "ce_loss": 2.298284, "epoch": 0.0014, "grad_norm": 0.0063836537301540375, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.062201, "kv_vq_loss": 0.000884, "learning_rate": 0.0007865320089195594, "loss": 0.063095, "step": 1400, "value_mse_loss_layer_000": 0.001015, "value_mse_loss_layer_001": 0.002792, "value_mse_loss_layer_002": 0.011292, "value_mse_loss_layer_003": 0.018555, "value_mse_loss_layer_004": 0.016479, "value_mse_loss_layer_005": 0.016479, "value_mse_loss_layer_006": 0.019043, "value_mse_loss_layer_007": 0.021973, "value_mse_loss_layer_008": 0.025146, "value_mse_loss_layer_009": 0.035645, "value_mse_loss_layer_010": 0.029419, "value_mse_loss_layer_011": 0.033203, "value_mse_loss_layer_012": 0.031006, "value_mse_loss_layer_013": 0.032227, "value_mse_loss_layer_014": 0.034668, "value_mse_loss_layer_015": 0.034668, "value_mse_loss_layer_016": 0.032715, "value_mse_loss_layer_017": 0.035156, "value_mse_loss_layer_018": 0.036133, "value_mse_loss_layer_019": 0.041504, "value_mse_loss_layer_020": 0.045166, "value_mse_loss_layer_021": 0.049316, "value_mse_loss_layer_022": 0.046631, "value_mse_loss_layer_023": 0.070312, "value_mse_loss_layer_024": 0.061768, "value_mse_loss_layer_025": 0.074707, "value_mse_loss_layer_026": 0.06543, "value_mse_loss_layer_027": 0.080566, "value_mse_loss_layer_028": 0.082031, "value_mse_loss_layer_029": 0.125, "value_mse_loss_layer_030": 0.10791, "value_mse_loss_layer_031": 0.143555, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 0.000115, "vq_loss_layer_005": 0.000126, "vq_loss_layer_006": 0.0002, "vq_loss_layer_007": 0.00032, "vq_loss_layer_008": 0.00032, "vq_loss_layer_009": 0.000576, "vq_loss_layer_010": 0.000395, "vq_loss_layer_011": 0.000568, "vq_loss_layer_012": 0.000637, "vq_loss_layer_013": 0.000496, "vq_loss_layer_014": 0.000683, "vq_loss_layer_015": 0.000584, "vq_loss_layer_016": 0.000706, "vq_loss_layer_017": 0.000553, "vq_loss_layer_018": 0.000401, "vq_loss_layer_019": 0.000315, "vq_loss_layer_020": 0.000397, "vq_loss_layer_021": 0.000607, "vq_loss_layer_022": 0.000383, "vq_loss_layer_023": 0.000832, "vq_loss_layer_024": 0.000467, "vq_loss_layer_025": 0.000591, "vq_loss_layer_026": 0.000927, "vq_loss_layer_027": 0.000828, "vq_loss_layer_028": 0.001305, "vq_loss_layer_029": 0.002304, "vq_loss_layer_030": 0.003296, "vq_loss_layer_031": 0.008911 }, { "ce_loss": 2.252765, "epoch": 0.00141, "grad_norm": 0.005909788887947798, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.124512, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.110352, "key_mse_loss_layer_016": 0.104004, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.111328, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.103516, "key_mse_loss_layer_021": 0.098633, "key_mse_loss_layer_022": 0.101562, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.09375, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.062915, "kv_vq_loss": 0.000976, "learning_rate": 0.000787304778163845, "loss": 0.063919, "step": 1410, "value_mse_loss_layer_000": 0.001015, "value_mse_loss_layer_001": 0.002838, "value_mse_loss_layer_002": 0.011292, "value_mse_loss_layer_003": 0.020508, "value_mse_loss_layer_004": 0.016235, "value_mse_loss_layer_005": 0.016724, "value_mse_loss_layer_006": 0.021118, "value_mse_loss_layer_007": 0.021973, "value_mse_loss_layer_008": 0.025146, "value_mse_loss_layer_009": 0.032959, "value_mse_loss_layer_010": 0.028198, "value_mse_loss_layer_011": 0.03125, "value_mse_loss_layer_012": 0.031982, "value_mse_loss_layer_013": 0.032471, "value_mse_loss_layer_014": 0.034424, "value_mse_loss_layer_015": 0.034912, "value_mse_loss_layer_016": 0.030029, "value_mse_loss_layer_017": 0.033936, "value_mse_loss_layer_018": 0.043213, "value_mse_loss_layer_019": 0.045654, "value_mse_loss_layer_020": 0.041748, "value_mse_loss_layer_021": 0.051025, "value_mse_loss_layer_022": 0.04541, "value_mse_loss_layer_023": 0.055908, "value_mse_loss_layer_024": 0.057373, "value_mse_loss_layer_025": 0.075195, "value_mse_loss_layer_026": 0.061523, "value_mse_loss_layer_027": 0.081055, "value_mse_loss_layer_028": 0.07959, "value_mse_loss_layer_029": 0.116699, "value_mse_loss_layer_030": 0.108398, "value_mse_loss_layer_031": 0.15918, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 5.1e-05, "vq_loss_layer_004": 9.8e-05, "vq_loss_layer_005": 0.000126, "vq_loss_layer_006": 0.000299, "vq_loss_layer_007": 0.00032, "vq_loss_layer_008": 0.000343, "vq_loss_layer_009": 0.000462, "vq_loss_layer_010": 0.000353, "vq_loss_layer_011": 0.000439, "vq_loss_layer_012": 0.000793, "vq_loss_layer_013": 0.000507, "vq_loss_layer_014": 0.000717, "vq_loss_layer_015": 0.00061, "vq_loss_layer_016": 0.000591, "vq_loss_layer_017": 0.000553, "vq_loss_layer_018": 0.000534, "vq_loss_layer_019": 0.000336, "vq_loss_layer_020": 0.000328, "vq_loss_layer_021": 0.000732, "vq_loss_layer_022": 0.00037, "vq_loss_layer_023": 0.00053, "vq_loss_layer_024": 0.000431, "vq_loss_layer_025": 0.000603, "vq_loss_layer_026": 0.00087, "vq_loss_layer_027": 0.00095, "vq_loss_layer_028": 0.001236, "vq_loss_layer_029": 0.00206, "vq_loss_layer_030": 0.003143, "vq_loss_layer_031": 0.010376 }, { "ce_loss": 2.301707, "epoch": 0.00142, "grad_norm": 0.005476841237396002, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.062277, "kv_vq_loss": 0.000934, "learning_rate": 0.0007880720860957641, "loss": 0.063239, "step": 1420, "value_mse_loss_layer_000": 0.001038, "value_mse_loss_layer_001": 0.002823, "value_mse_loss_layer_002": 0.011292, "value_mse_loss_layer_003": 0.017822, "value_mse_loss_layer_004": 0.017456, "value_mse_loss_layer_005": 0.016602, "value_mse_loss_layer_006": 0.019531, "value_mse_loss_layer_007": 0.022461, "value_mse_loss_layer_008": 0.025269, "value_mse_loss_layer_009": 0.0354, "value_mse_loss_layer_010": 0.028809, "value_mse_loss_layer_011": 0.031006, "value_mse_loss_layer_012": 0.032471, "value_mse_loss_layer_013": 0.032471, "value_mse_loss_layer_014": 0.035156, "value_mse_loss_layer_015": 0.036377, "value_mse_loss_layer_016": 0.033203, "value_mse_loss_layer_017": 0.035889, "value_mse_loss_layer_018": 0.0354, "value_mse_loss_layer_019": 0.041016, "value_mse_loss_layer_020": 0.04126, "value_mse_loss_layer_021": 0.051514, "value_mse_loss_layer_022": 0.051758, "value_mse_loss_layer_023": 0.057861, "value_mse_loss_layer_024": 0.060547, "value_mse_loss_layer_025": 0.077637, "value_mse_loss_layer_026": 0.081543, "value_mse_loss_layer_027": 0.083008, "value_mse_loss_layer_028": 0.082031, "value_mse_loss_layer_029": 0.122559, "value_mse_loss_layer_030": 0.108887, "value_mse_loss_layer_031": 0.144531, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 2.5e-05, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 0.000134, "vq_loss_layer_005": 0.00013, "vq_loss_layer_006": 0.000227, "vq_loss_layer_007": 0.000349, "vq_loss_layer_008": 0.00033, "vq_loss_layer_009": 0.000587, "vq_loss_layer_010": 0.000412, "vq_loss_layer_011": 0.000412, "vq_loss_layer_012": 0.00069, "vq_loss_layer_013": 0.000549, "vq_loss_layer_014": 0.000668, "vq_loss_layer_015": 0.000698, "vq_loss_layer_016": 0.000793, "vq_loss_layer_017": 0.00066, "vq_loss_layer_018": 0.000408, "vq_loss_layer_019": 0.000324, "vq_loss_layer_020": 0.000374, "vq_loss_layer_021": 0.00082, "vq_loss_layer_022": 0.000565, "vq_loss_layer_023": 0.000603, "vq_loss_layer_024": 0.000576, "vq_loss_layer_025": 0.000809, "vq_loss_layer_026": 0.001907, "vq_loss_layer_027": 0.001129, "vq_loss_layer_028": 0.001724, "vq_loss_layer_029": 0.002853, "vq_loss_layer_030": 0.004089, "vq_loss_layer_031": 0.010803 }, { "ce_loss": 2.293708, "epoch": 0.00143, "grad_norm": 0.006045075133442879, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.124512, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.10498, "key_mse_loss_layer_017": 0.105957, "key_mse_loss_layer_018": 0.111816, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.104004, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.062726, "kv_vq_loss": 0.000896, "learning_rate": 0.0007888340093662653, "loss": 0.063617, "step": 1430, "value_mse_loss_layer_000": 0.001045, "value_mse_loss_layer_001": 0.002853, "value_mse_loss_layer_002": 0.01123, "value_mse_loss_layer_003": 0.018799, "value_mse_loss_layer_004": 0.016479, "value_mse_loss_layer_005": 0.016846, "value_mse_loss_layer_006": 0.019897, "value_mse_loss_layer_007": 0.021484, "value_mse_loss_layer_008": 0.025513, "value_mse_loss_layer_009": 0.032471, "value_mse_loss_layer_010": 0.027954, "value_mse_loss_layer_011": 0.03125, "value_mse_loss_layer_012": 0.03064, "value_mse_loss_layer_013": 0.033203, "value_mse_loss_layer_014": 0.033691, "value_mse_loss_layer_015": 0.033936, "value_mse_loss_layer_016": 0.030029, "value_mse_loss_layer_017": 0.033691, "value_mse_loss_layer_018": 0.034424, "value_mse_loss_layer_019": 0.038818, "value_mse_loss_layer_020": 0.041992, "value_mse_loss_layer_021": 0.0625, "value_mse_loss_layer_022": 0.045654, "value_mse_loss_layer_023": 0.051758, "value_mse_loss_layer_024": 0.055908, "value_mse_loss_layer_025": 0.077637, "value_mse_loss_layer_026": 0.060059, "value_mse_loss_layer_027": 0.07959, "value_mse_loss_layer_028": 0.077148, "value_mse_loss_layer_029": 0.129883, "value_mse_loss_layer_030": 0.106445, "value_mse_loss_layer_031": 0.139648, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 5.1e-05, "vq_loss_layer_004": 0.000103, "vq_loss_layer_005": 0.000141, "vq_loss_layer_006": 0.000252, "vq_loss_layer_007": 0.000315, "vq_loss_layer_008": 0.000406, "vq_loss_layer_009": 0.000469, "vq_loss_layer_010": 0.000385, "vq_loss_layer_011": 0.0005, "vq_loss_layer_012": 0.000683, "vq_loss_layer_013": 0.000595, "vq_loss_layer_014": 0.000725, "vq_loss_layer_015": 0.000591, "vq_loss_layer_016": 0.000645, "vq_loss_layer_017": 0.000515, "vq_loss_layer_018": 0.00032, "vq_loss_layer_019": 0.000275, "vq_loss_layer_020": 0.000349, "vq_loss_layer_021": 0.001129, "vq_loss_layer_022": 0.000423, "vq_loss_layer_023": 0.000463, "vq_loss_layer_024": 0.00046, "vq_loss_layer_025": 0.000721, "vq_loss_layer_026": 0.000904, "vq_loss_layer_027": 0.000984, "vq_loss_layer_028": 0.001366, "vq_loss_layer_029": 0.002441, "vq_loss_layer_030": 0.003479, "vq_loss_layer_031": 0.009216 }, { "ce_loss": 2.253983, "epoch": 0.00144, "grad_norm": 0.006502675823867321, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.062103, "kv_vq_loss": 0.000892, "learning_rate": 0.0007895906230238124, "loss": 0.062988, "step": 1440, "value_mse_loss_layer_000": 0.001038, "value_mse_loss_layer_001": 0.002853, "value_mse_loss_layer_002": 0.011292, "value_mse_loss_layer_003": 0.018188, "value_mse_loss_layer_004": 0.017944, "value_mse_loss_layer_005": 0.016602, "value_mse_loss_layer_006": 0.019287, "value_mse_loss_layer_007": 0.021118, "value_mse_loss_layer_008": 0.024658, "value_mse_loss_layer_009": 0.032227, "value_mse_loss_layer_010": 0.028076, "value_mse_loss_layer_011": 0.030884, "value_mse_loss_layer_012": 0.035889, "value_mse_loss_layer_013": 0.032959, "value_mse_loss_layer_014": 0.034424, "value_mse_loss_layer_015": 0.036133, "value_mse_loss_layer_016": 0.032471, "value_mse_loss_layer_017": 0.036133, "value_mse_loss_layer_018": 0.0354, "value_mse_loss_layer_019": 0.048828, "value_mse_loss_layer_020": 0.044189, "value_mse_loss_layer_021": 0.054932, "value_mse_loss_layer_022": 0.047363, "value_mse_loss_layer_023": 0.054688, "value_mse_loss_layer_024": 0.071289, "value_mse_loss_layer_025": 0.079102, "value_mse_loss_layer_026": 0.060303, "value_mse_loss_layer_027": 0.080566, "value_mse_loss_layer_028": 0.079102, "value_mse_loss_layer_029": 0.123535, "value_mse_loss_layer_030": 0.116699, "value_mse_loss_layer_031": 0.143555, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 0.000189, "vq_loss_layer_005": 0.00014, "vq_loss_layer_006": 0.000217, "vq_loss_layer_007": 0.000296, "vq_loss_layer_008": 0.000299, "vq_loss_layer_009": 0.000368, "vq_loss_layer_010": 0.000338, "vq_loss_layer_011": 0.000395, "vq_loss_layer_012": 0.000969, "vq_loss_layer_013": 0.000538, "vq_loss_layer_014": 0.000618, "vq_loss_layer_015": 0.000652, "vq_loss_layer_016": 0.000671, "vq_loss_layer_017": 0.000771, "vq_loss_layer_018": 0.000353, "vq_loss_layer_019": 0.000383, "vq_loss_layer_020": 0.000395, "vq_loss_layer_021": 0.000721, "vq_loss_layer_022": 0.00037, "vq_loss_layer_023": 0.000473, "vq_loss_layer_024": 0.000507, "vq_loss_layer_025": 0.000568, "vq_loss_layer_026": 0.000717, "vq_loss_layer_027": 0.000828, "vq_loss_layer_028": 0.001076, "vq_loss_layer_029": 0.002289, "vq_loss_layer_030": 0.003311, "vq_loss_layer_031": 0.008972 }, { "ce_loss": 2.28379, "epoch": 0.00145, "grad_norm": 0.006327849812805653, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.062387, "kv_vq_loss": 0.000915, "learning_rate": 0.0007903420005587436, "loss": 0.063293, "step": 1450, "value_mse_loss_layer_000": 0.001053, "value_mse_loss_layer_001": 0.002808, "value_mse_loss_layer_002": 0.011536, "value_mse_loss_layer_003": 0.018433, "value_mse_loss_layer_004": 0.01709, "value_mse_loss_layer_005": 0.016968, "value_mse_loss_layer_006": 0.02124, "value_mse_loss_layer_007": 0.022705, "value_mse_loss_layer_008": 0.025635, "value_mse_loss_layer_009": 0.034668, "value_mse_loss_layer_010": 0.030884, "value_mse_loss_layer_011": 0.031982, "value_mse_loss_layer_012": 0.032959, "value_mse_loss_layer_013": 0.034912, "value_mse_loss_layer_014": 0.035889, "value_mse_loss_layer_015": 0.037598, "value_mse_loss_layer_016": 0.032959, "value_mse_loss_layer_017": 0.036621, "value_mse_loss_layer_018": 0.035156, "value_mse_loss_layer_019": 0.040527, "value_mse_loss_layer_020": 0.043213, "value_mse_loss_layer_021": 0.050537, "value_mse_loss_layer_022": 0.044678, "value_mse_loss_layer_023": 0.054443, "value_mse_loss_layer_024": 0.05835, "value_mse_loss_layer_025": 0.085449, "value_mse_loss_layer_026": 0.066406, "value_mse_loss_layer_027": 0.07959, "value_mse_loss_layer_028": 0.07373, "value_mse_loss_layer_029": 0.119629, "value_mse_loss_layer_030": 0.104492, "value_mse_loss_layer_031": 0.133789, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 0.000114, "vq_loss_layer_005": 0.000136, "vq_loss_layer_006": 0.000324, "vq_loss_layer_007": 0.000328, "vq_loss_layer_008": 0.000305, "vq_loss_layer_009": 0.000448, "vq_loss_layer_010": 0.000429, "vq_loss_layer_011": 0.000427, "vq_loss_layer_012": 0.000694, "vq_loss_layer_013": 0.000549, "vq_loss_layer_014": 0.000736, "vq_loss_layer_015": 0.000839, "vq_loss_layer_016": 0.000648, "vq_loss_layer_017": 0.000557, "vq_loss_layer_018": 0.000347, "vq_loss_layer_019": 0.000322, "vq_loss_layer_020": 0.000366, "vq_loss_layer_021": 0.000713, "vq_loss_layer_022": 0.000393, "vq_loss_layer_023": 0.000607, "vq_loss_layer_024": 0.000546, "vq_loss_layer_025": 0.000847, "vq_loss_layer_026": 0.001198, "vq_loss_layer_027": 0.001175, "vq_loss_layer_028": 0.001343, "vq_loss_layer_029": 0.002869, "vq_loss_layer_030": 0.003418, "vq_loss_layer_031": 0.009888 }, { "ce_loss": 2.291989, "epoch": 0.00146, "grad_norm": 0.004803039599210024, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.061865, "kv_vq_loss": 0.000886, "learning_rate": 0.0007910882139461092, "loss": 0.06275, "step": 1460, "value_mse_loss_layer_000": 0.00103, "value_mse_loss_layer_001": 0.002853, "value_mse_loss_layer_002": 0.010986, "value_mse_loss_layer_003": 0.017578, "value_mse_loss_layer_004": 0.015991, "value_mse_loss_layer_005": 0.016602, "value_mse_loss_layer_006": 0.019043, "value_mse_loss_layer_007": 0.021606, "value_mse_loss_layer_008": 0.025757, "value_mse_loss_layer_009": 0.032715, "value_mse_loss_layer_010": 0.028076, "value_mse_loss_layer_011": 0.031738, "value_mse_loss_layer_012": 0.031738, "value_mse_loss_layer_013": 0.032715, "value_mse_loss_layer_014": 0.034912, "value_mse_loss_layer_015": 0.036377, "value_mse_loss_layer_016": 0.032715, "value_mse_loss_layer_017": 0.035889, "value_mse_loss_layer_018": 0.036133, "value_mse_loss_layer_019": 0.040527, "value_mse_loss_layer_020": 0.049561, "value_mse_loss_layer_021": 0.051758, "value_mse_loss_layer_022": 0.048584, "value_mse_loss_layer_023": 0.05835, "value_mse_loss_layer_024": 0.063965, "value_mse_loss_layer_025": 0.086426, "value_mse_loss_layer_026": 0.068359, "value_mse_loss_layer_027": 0.090332, "value_mse_loss_layer_028": 0.088867, "value_mse_loss_layer_029": 0.135742, "value_mse_loss_layer_030": 0.113281, "value_mse_loss_layer_031": 0.145508, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 9.7e-05, "vq_loss_layer_005": 0.000128, "vq_loss_layer_006": 0.000192, "vq_loss_layer_007": 0.000284, "vq_loss_layer_008": 0.000328, "vq_loss_layer_009": 0.000376, "vq_loss_layer_010": 0.000305, "vq_loss_layer_011": 0.000423, "vq_loss_layer_012": 0.000618, "vq_loss_layer_013": 0.000483, "vq_loss_layer_014": 0.000538, "vq_loss_layer_015": 0.000603, "vq_loss_layer_016": 0.000591, "vq_loss_layer_017": 0.000488, "vq_loss_layer_018": 0.000381, "vq_loss_layer_019": 0.000269, "vq_loss_layer_020": 0.000292, "vq_loss_layer_021": 0.000504, "vq_loss_layer_022": 0.00032, "vq_loss_layer_023": 0.000458, "vq_loss_layer_024": 0.000353, "vq_loss_layer_025": 0.000496, "vq_loss_layer_026": 0.000656, "vq_loss_layer_027": 0.000713, "vq_loss_layer_028": 0.001122, "vq_loss_layer_029": 0.002106, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.00769 }, { "ce_loss": 2.281844, "epoch": 0.00147, "grad_norm": 0.005127807147800922, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.096191, "key_mse_loss_layer_031": 0.08252, "kv_mse_loss": 0.061829, "kv_vq_loss": 0.00086, "learning_rate": 0.0007918293336870439, "loss": 0.062698, "step": 1470, "value_mse_loss_layer_000": 0.00103, "value_mse_loss_layer_001": 0.002838, "value_mse_loss_layer_002": 0.011292, "value_mse_loss_layer_003": 0.021118, "value_mse_loss_layer_004": 0.015625, "value_mse_loss_layer_005": 0.015564, "value_mse_loss_layer_006": 0.019409, "value_mse_loss_layer_007": 0.021729, "value_mse_loss_layer_008": 0.024292, "value_mse_loss_layer_009": 0.031494, "value_mse_loss_layer_010": 0.029785, "value_mse_loss_layer_011": 0.029297, "value_mse_loss_layer_012": 0.029419, "value_mse_loss_layer_013": 0.031128, "value_mse_loss_layer_014": 0.032227, "value_mse_loss_layer_015": 0.033203, "value_mse_loss_layer_016": 0.029663, "value_mse_loss_layer_017": 0.032959, "value_mse_loss_layer_018": 0.039551, "value_mse_loss_layer_019": 0.038818, "value_mse_loss_layer_020": 0.041016, "value_mse_loss_layer_021": 0.052246, "value_mse_loss_layer_022": 0.047363, "value_mse_loss_layer_023": 0.056152, "value_mse_loss_layer_024": 0.059082, "value_mse_loss_layer_025": 0.079102, "value_mse_loss_layer_026": 0.058838, "value_mse_loss_layer_027": 0.078613, "value_mse_loss_layer_028": 0.07373, "value_mse_loss_layer_029": 0.116699, "value_mse_loss_layer_030": 0.103027, "value_mse_loss_layer_031": 0.136719, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 9.3e-05, "vq_loss_layer_005": 0.000116, "vq_loss_layer_006": 0.000244, "vq_loss_layer_007": 0.000401, "vq_loss_layer_008": 0.000305, "vq_loss_layer_009": 0.000395, "vq_loss_layer_010": 0.00037, "vq_loss_layer_011": 0.00037, "vq_loss_layer_012": 0.00066, "vq_loss_layer_013": 0.00058, "vq_loss_layer_014": 0.000652, "vq_loss_layer_015": 0.000607, "vq_loss_layer_016": 0.000645, "vq_loss_layer_017": 0.00058, "vq_loss_layer_018": 0.000404, "vq_loss_layer_019": 0.000256, "vq_loss_layer_020": 0.000305, "vq_loss_layer_021": 0.000664, "vq_loss_layer_022": 0.00038, "vq_loss_layer_023": 0.000471, "vq_loss_layer_024": 0.000425, "vq_loss_layer_025": 0.000568, "vq_loss_layer_026": 0.000786, "vq_loss_layer_027": 0.000896, "vq_loss_layer_028": 0.00116, "vq_loss_layer_029": 0.002747, "vq_loss_layer_030": 0.003647, "vq_loss_layer_031": 0.009399 }, { "ce_loss": 2.309328, "epoch": 0.00148, "grad_norm": 0.0065884944051504135, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.061838, "kv_vq_loss": 0.000883, "learning_rate": 0.0007925654288487392, "loss": 0.062695, "step": 1480, "value_mse_loss_layer_000": 0.001022, "value_mse_loss_layer_001": 0.002777, "value_mse_loss_layer_002": 0.011108, "value_mse_loss_layer_003": 0.017822, "value_mse_loss_layer_004": 0.016724, "value_mse_loss_layer_005": 0.016846, "value_mse_loss_layer_006": 0.019165, "value_mse_loss_layer_007": 0.021851, "value_mse_loss_layer_008": 0.025269, "value_mse_loss_layer_009": 0.033203, "value_mse_loss_layer_010": 0.029175, "value_mse_loss_layer_011": 0.031128, "value_mse_loss_layer_012": 0.040039, "value_mse_loss_layer_013": 0.032471, "value_mse_loss_layer_014": 0.037354, "value_mse_loss_layer_015": 0.036133, "value_mse_loss_layer_016": 0.032715, "value_mse_loss_layer_017": 0.036133, "value_mse_loss_layer_018": 0.034424, "value_mse_loss_layer_019": 0.04126, "value_mse_loss_layer_020": 0.040771, "value_mse_loss_layer_021": 0.05127, "value_mse_loss_layer_022": 0.049072, "value_mse_loss_layer_023": 0.056885, "value_mse_loss_layer_024": 0.070801, "value_mse_loss_layer_025": 0.085449, "value_mse_loss_layer_026": 0.076172, "value_mse_loss_layer_027": 0.080566, "value_mse_loss_layer_028": 0.08252, "value_mse_loss_layer_029": 0.120117, "value_mse_loss_layer_030": 0.107422, "value_mse_loss_layer_031": 0.137695, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.8e-05, "vq_loss_layer_004": 0.00013, "vq_loss_layer_005": 0.000137, "vq_loss_layer_006": 0.000203, "vq_loss_layer_007": 0.00032, "vq_loss_layer_008": 0.000322, "vq_loss_layer_009": 0.000429, "vq_loss_layer_010": 0.00038, "vq_loss_layer_011": 0.000408, "vq_loss_layer_012": 0.001251, "vq_loss_layer_013": 0.000557, "vq_loss_layer_014": 0.000751, "vq_loss_layer_015": 0.000671, "vq_loss_layer_016": 0.000721, "vq_loss_layer_017": 0.000858, "vq_loss_layer_018": 0.000387, "vq_loss_layer_019": 0.000397, "vq_loss_layer_020": 0.000345, "vq_loss_layer_021": 0.00066, "vq_loss_layer_022": 0.000448, "vq_loss_layer_023": 0.000599, "vq_loss_layer_024": 0.000591, "vq_loss_layer_025": 0.000633, "vq_loss_layer_026": 0.001244, "vq_loss_layer_027": 0.00082, "vq_loss_layer_028": 0.001236, "vq_loss_layer_029": 0.001984, "vq_loss_layer_030": 0.002945, "vq_loss_layer_031": 0.008423 }, { "ce_loss": 2.277507, "epoch": 0.00149, "grad_norm": 0.006486289668828249, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.062436, "kv_vq_loss": 0.000902, "learning_rate": 0.0007932965671030684, "loss": 0.063336, "step": 1490, "value_mse_loss_layer_000": 0.001038, "value_mse_loss_layer_001": 0.002838, "value_mse_loss_layer_002": 0.011169, "value_mse_loss_layer_003": 0.017578, "value_mse_loss_layer_004": 0.016235, "value_mse_loss_layer_005": 0.015747, "value_mse_loss_layer_006": 0.019043, "value_mse_loss_layer_007": 0.02124, "value_mse_loss_layer_008": 0.024414, "value_mse_loss_layer_009": 0.031738, "value_mse_loss_layer_010": 0.027832, "value_mse_loss_layer_011": 0.030273, "value_mse_loss_layer_012": 0.031738, "value_mse_loss_layer_013": 0.031494, "value_mse_loss_layer_014": 0.034912, "value_mse_loss_layer_015": 0.036133, "value_mse_loss_layer_016": 0.039551, "value_mse_loss_layer_017": 0.034424, "value_mse_loss_layer_018": 0.035645, "value_mse_loss_layer_019": 0.040527, "value_mse_loss_layer_020": 0.043701, "value_mse_loss_layer_021": 0.060303, "value_mse_loss_layer_022": 0.04834, "value_mse_loss_layer_023": 0.062012, "value_mse_loss_layer_024": 0.061523, "value_mse_loss_layer_025": 0.077148, "value_mse_loss_layer_026": 0.06543, "value_mse_loss_layer_027": 0.086914, "value_mse_loss_layer_028": 0.084961, "value_mse_loss_layer_029": 0.130859, "value_mse_loss_layer_030": 0.107422, "value_mse_loss_layer_031": 0.148438, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 0.000112, "vq_loss_layer_005": 0.000115, "vq_loss_layer_006": 0.000216, "vq_loss_layer_007": 0.000317, "vq_loss_layer_008": 0.000315, "vq_loss_layer_009": 0.00038, "vq_loss_layer_010": 0.00034, "vq_loss_layer_011": 0.000393, "vq_loss_layer_012": 0.000679, "vq_loss_layer_013": 0.000519, "vq_loss_layer_014": 0.000725, "vq_loss_layer_015": 0.000675, "vq_loss_layer_016": 0.000965, "vq_loss_layer_017": 0.000626, "vq_loss_layer_018": 0.000341, "vq_loss_layer_019": 0.000362, "vq_loss_layer_020": 0.00034, "vq_loss_layer_021": 0.000885, "vq_loss_layer_022": 0.000393, "vq_loss_layer_023": 0.000706, "vq_loss_layer_024": 0.000496, "vq_loss_layer_025": 0.000568, "vq_loss_layer_026": 0.000965, "vq_loss_layer_027": 0.001083, "vq_loss_layer_028": 0.00135, "vq_loss_layer_029": 0.002548, "vq_loss_layer_030": 0.003128, "vq_loss_layer_031": 0.009094 }, { "ce_loss": 2.277186, "epoch": 0.0015, "grad_norm": 0.0049550472758710384, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.058838, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.097168, "key_mse_loss_layer_031": 0.086426, "kv_mse_loss": 0.062076, "kv_vq_loss": 0.000887, "learning_rate": 0.0007940228147639202, "loss": 0.062949, "step": 1500, "value_mse_loss_layer_000": 0.001007, "value_mse_loss_layer_001": 0.002808, "value_mse_loss_layer_002": 0.011108, "value_mse_loss_layer_003": 0.017456, "value_mse_loss_layer_004": 0.015625, "value_mse_loss_layer_005": 0.016235, "value_mse_loss_layer_006": 0.018555, "value_mse_loss_layer_007": 0.020996, "value_mse_loss_layer_008": 0.025513, "value_mse_loss_layer_009": 0.032959, "value_mse_loss_layer_010": 0.030151, "value_mse_loss_layer_011": 0.031738, "value_mse_loss_layer_012": 0.031982, "value_mse_loss_layer_013": 0.032715, "value_mse_loss_layer_014": 0.034668, "value_mse_loss_layer_015": 0.036621, "value_mse_loss_layer_016": 0.033447, "value_mse_loss_layer_017": 0.037109, "value_mse_loss_layer_018": 0.0354, "value_mse_loss_layer_019": 0.040771, "value_mse_loss_layer_020": 0.041504, "value_mse_loss_layer_021": 0.050049, "value_mse_loss_layer_022": 0.047363, "value_mse_loss_layer_023": 0.056885, "value_mse_loss_layer_024": 0.066895, "value_mse_loss_layer_025": 0.080078, "value_mse_loss_layer_026": 0.065918, "value_mse_loss_layer_027": 0.087891, "value_mse_loss_layer_028": 0.08252, "value_mse_loss_layer_029": 0.12207, "value_mse_loss_layer_030": 0.112793, "value_mse_loss_layer_031": 0.137695, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 0.000119, "vq_loss_layer_005": 0.00015, "vq_loss_layer_006": 0.000194, "vq_loss_layer_007": 0.000315, "vq_loss_layer_008": 0.000343, "vq_loss_layer_009": 0.000429, "vq_loss_layer_010": 0.000393, "vq_loss_layer_011": 0.000444, "vq_loss_layer_012": 0.000652, "vq_loss_layer_013": 0.000542, "vq_loss_layer_014": 0.000629, "vq_loss_layer_015": 0.000675, "vq_loss_layer_016": 0.000698, "vq_loss_layer_017": 0.000877, "vq_loss_layer_018": 0.00038, "vq_loss_layer_019": 0.000303, "vq_loss_layer_020": 0.000301, "vq_loss_layer_021": 0.000546, "vq_loss_layer_022": 0.00036, "vq_loss_layer_023": 0.000444, "vq_loss_layer_024": 0.000561, "vq_loss_layer_025": 0.000576, "vq_loss_layer_026": 0.000839, "vq_loss_layer_027": 0.001175, "vq_loss_layer_028": 0.001381, "vq_loss_layer_029": 0.003006, "vq_loss_layer_030": 0.004242, "vq_loss_layer_031": 0.009277 }, { "ce_loss": 2.282177, "epoch": 0.00151, "grad_norm": 0.006871087476611137, "key_mse_loss_layer_000": 0.003784, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.059814, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.071289, "key_mse_loss_layer_008": 0.079102, "key_mse_loss_layer_009": 0.082031, "key_mse_loss_layer_010": 0.093262, "key_mse_loss_layer_011": 0.092285, "key_mse_loss_layer_012": 0.068359, "key_mse_loss_layer_013": 0.104492, "key_mse_loss_layer_014": 0.101562, "key_mse_loss_layer_015": 0.091309, "key_mse_loss_layer_016": 0.083984, "key_mse_loss_layer_017": 0.087402, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.06214, "kv_vq_loss": 0.000898, "learning_rate": 0.0007947442368232922, "loss": 0.063031, "step": 1510, "value_mse_loss_layer_000": 0.001007, "value_mse_loss_layer_001": 0.002747, "value_mse_loss_layer_002": 0.011108, "value_mse_loss_layer_003": 0.018066, "value_mse_loss_layer_004": 0.017822, "value_mse_loss_layer_005": 0.016479, "value_mse_loss_layer_006": 0.02063, "value_mse_loss_layer_007": 0.021484, "value_mse_loss_layer_008": 0.024414, "value_mse_loss_layer_009": 0.03064, "value_mse_loss_layer_010": 0.026367, "value_mse_loss_layer_011": 0.029053, "value_mse_loss_layer_012": 0.029541, "value_mse_loss_layer_013": 0.029785, "value_mse_loss_layer_014": 0.034668, "value_mse_loss_layer_015": 0.033936, "value_mse_loss_layer_016": 0.030396, "value_mse_loss_layer_017": 0.032959, "value_mse_loss_layer_018": 0.035889, "value_mse_loss_layer_019": 0.039551, "value_mse_loss_layer_020": 0.040771, "value_mse_loss_layer_021": 0.056396, "value_mse_loss_layer_022": 0.049316, "value_mse_loss_layer_023": 0.054932, "value_mse_loss_layer_024": 0.067871, "value_mse_loss_layer_025": 0.091309, "value_mse_loss_layer_026": 0.067383, "value_mse_loss_layer_027": 0.091309, "value_mse_loss_layer_028": 0.087891, "value_mse_loss_layer_029": 0.146484, "value_mse_loss_layer_030": 0.120117, "value_mse_loss_layer_031": 0.157227, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 2.7e-05, "vq_loss_layer_002": 3e-05, "vq_loss_layer_003": 7e-05, "vq_loss_layer_004": 0.000142, "vq_loss_layer_005": 0.000128, "vq_loss_layer_006": 0.000286, "vq_loss_layer_007": 0.00029, "vq_loss_layer_008": 0.000345, "vq_loss_layer_009": 0.000368, "vq_loss_layer_010": 0.000385, "vq_loss_layer_011": 0.000439, "vq_loss_layer_012": 0.000607, "vq_loss_layer_013": 0.0005, "vq_loss_layer_014": 0.000679, "vq_loss_layer_015": 0.000732, "vq_loss_layer_016": 0.000641, "vq_loss_layer_017": 0.000486, "vq_loss_layer_018": 0.000454, "vq_loss_layer_019": 0.000336, "vq_loss_layer_020": 0.000275, "vq_loss_layer_021": 0.000923, "vq_loss_layer_022": 0.000446, "vq_loss_layer_023": 0.000389, "vq_loss_layer_024": 0.000511, "vq_loss_layer_025": 0.000851, "vq_loss_layer_026": 0.000832, "vq_loss_layer_027": 0.001015, "vq_loss_layer_028": 0.001694, "vq_loss_layer_029": 0.003281, "vq_loss_layer_030": 0.004272, "vq_loss_layer_031": 0.013062 }, { "ce_loss": 2.259046, "epoch": 0.00152, "grad_norm": 0.004917917773127556, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.061703, "kv_vq_loss": 0.0009, "learning_rate": 0.000795460896986193, "loss": 0.062598, "step": 1520, "value_mse_loss_layer_000": 0.00103, "value_mse_loss_layer_001": 0.002792, "value_mse_loss_layer_002": 0.011292, "value_mse_loss_layer_003": 0.017578, "value_mse_loss_layer_004": 0.01532, "value_mse_loss_layer_005": 0.0177, "value_mse_loss_layer_006": 0.019531, "value_mse_loss_layer_007": 0.020752, "value_mse_loss_layer_008": 0.024536, "value_mse_loss_layer_009": 0.033447, "value_mse_loss_layer_010": 0.027344, "value_mse_loss_layer_011": 0.029663, "value_mse_loss_layer_012": 0.030151, "value_mse_loss_layer_013": 0.031982, "value_mse_loss_layer_014": 0.033447, "value_mse_loss_layer_015": 0.0354, "value_mse_loss_layer_016": 0.030762, "value_mse_loss_layer_017": 0.034424, "value_mse_loss_layer_018": 0.034668, "value_mse_loss_layer_019": 0.038818, "value_mse_loss_layer_020": 0.039795, "value_mse_loss_layer_021": 0.049316, "value_mse_loss_layer_022": 0.046143, "value_mse_loss_layer_023": 0.063477, "value_mse_loss_layer_024": 0.057373, "value_mse_loss_layer_025": 0.070312, "value_mse_loss_layer_026": 0.057617, "value_mse_loss_layer_027": 0.076172, "value_mse_loss_layer_028": 0.075684, "value_mse_loss_layer_029": 0.11377, "value_mse_loss_layer_030": 0.106445, "value_mse_loss_layer_031": 0.135742, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 0.000165, "vq_loss_layer_006": 0.000235, "vq_loss_layer_007": 0.00028, "vq_loss_layer_008": 0.000275, "vq_loss_layer_009": 0.000467, "vq_loss_layer_010": 0.000298, "vq_loss_layer_011": 0.000364, "vq_loss_layer_012": 0.000603, "vq_loss_layer_013": 0.000526, "vq_loss_layer_014": 0.000576, "vq_loss_layer_015": 0.000607, "vq_loss_layer_016": 0.000595, "vq_loss_layer_017": 0.00053, "vq_loss_layer_018": 0.000336, "vq_loss_layer_019": 0.000237, "vq_loss_layer_020": 0.000286, "vq_loss_layer_021": 0.000626, "vq_loss_layer_022": 0.000345, "vq_loss_layer_023": 0.000584, "vq_loss_layer_024": 0.00038, "vq_loss_layer_025": 0.000456, "vq_loss_layer_026": 0.000675, "vq_loss_layer_027": 0.000729, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001831, "vq_loss_layer_030": 0.002716, "vq_loss_layer_031": 0.007812 }, { "ce_loss": 2.301005, "epoch": 0.00153, "grad_norm": 0.005960928741842508, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.045166, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.102051, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.061768, "kv_vq_loss": 0.000884, "learning_rate": 0.0007961728577043996, "loss": 0.062637, "step": 1530, "value_mse_loss_layer_000": 0.001007, "value_mse_loss_layer_001": 0.002716, "value_mse_loss_layer_002": 0.011047, "value_mse_loss_layer_003": 0.017334, "value_mse_loss_layer_004": 0.017456, "value_mse_loss_layer_005": 0.016602, "value_mse_loss_layer_006": 0.018677, "value_mse_loss_layer_007": 0.021729, "value_mse_loss_layer_008": 0.025024, "value_mse_loss_layer_009": 0.031738, "value_mse_loss_layer_010": 0.029419, "value_mse_loss_layer_011": 0.030518, "value_mse_loss_layer_012": 0.03125, "value_mse_loss_layer_013": 0.030396, "value_mse_loss_layer_014": 0.032715, "value_mse_loss_layer_015": 0.032959, "value_mse_loss_layer_016": 0.028198, "value_mse_loss_layer_017": 0.032471, "value_mse_loss_layer_018": 0.031128, "value_mse_loss_layer_019": 0.041992, "value_mse_loss_layer_020": 0.040527, "value_mse_loss_layer_021": 0.058838, "value_mse_loss_layer_022": 0.042236, "value_mse_loss_layer_023": 0.048828, "value_mse_loss_layer_024": 0.053711, "value_mse_loss_layer_025": 0.067871, "value_mse_loss_layer_026": 0.057617, "value_mse_loss_layer_027": 0.074219, "value_mse_loss_layer_028": 0.072754, "value_mse_loss_layer_029": 0.116699, "value_mse_loss_layer_030": 0.100586, "value_mse_loss_layer_031": 0.134766, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 2.2e-05, "vq_loss_layer_003": 4.9e-05, "vq_loss_layer_004": 0.000134, "vq_loss_layer_005": 0.000151, "vq_loss_layer_006": 0.000197, "vq_loss_layer_007": 0.000326, "vq_loss_layer_008": 0.000399, "vq_loss_layer_009": 0.000423, "vq_loss_layer_010": 0.000401, "vq_loss_layer_011": 0.000458, "vq_loss_layer_012": 0.000778, "vq_loss_layer_013": 0.000462, "vq_loss_layer_014": 0.000683, "vq_loss_layer_015": 0.000565, "vq_loss_layer_016": 0.000595, "vq_loss_layer_017": 0.000463, "vq_loss_layer_018": 0.000307, "vq_loss_layer_019": 0.000462, "vq_loss_layer_020": 0.000353, "vq_loss_layer_021": 0.001068, "vq_loss_layer_022": 0.000378, "vq_loss_layer_023": 0.000523, "vq_loss_layer_024": 0.000479, "vq_loss_layer_025": 0.000702, "vq_loss_layer_026": 0.001022, "vq_loss_layer_027": 0.001106, "vq_loss_layer_028": 0.001495, "vq_loss_layer_029": 0.003235, "vq_loss_layer_030": 0.003372, "vq_loss_layer_031": 0.00946 }, { "ce_loss": 2.300275, "epoch": 0.00154, "grad_norm": 0.006399385165423155, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.062234, "kv_vq_loss": 0.000897, "learning_rate": 0.0007968801802091157, "loss": 0.06312, "step": 1540, "value_mse_loss_layer_000": 0.001007, "value_mse_loss_layer_001": 0.002792, "value_mse_loss_layer_002": 0.011108, "value_mse_loss_layer_003": 0.020508, "value_mse_loss_layer_004": 0.015991, "value_mse_loss_layer_005": 0.015747, "value_mse_loss_layer_006": 0.018433, "value_mse_loss_layer_007": 0.020996, "value_mse_loss_layer_008": 0.025513, "value_mse_loss_layer_009": 0.031738, "value_mse_loss_layer_010": 0.027588, "value_mse_loss_layer_011": 0.029907, "value_mse_loss_layer_012": 0.030762, "value_mse_loss_layer_013": 0.033203, "value_mse_loss_layer_014": 0.033203, "value_mse_loss_layer_015": 0.035645, "value_mse_loss_layer_016": 0.031494, "value_mse_loss_layer_017": 0.034668, "value_mse_loss_layer_018": 0.035156, "value_mse_loss_layer_019": 0.040283, "value_mse_loss_layer_020": 0.041504, "value_mse_loss_layer_021": 0.052246, "value_mse_loss_layer_022": 0.049805, "value_mse_loss_layer_023": 0.053711, "value_mse_loss_layer_024": 0.056885, "value_mse_loss_layer_025": 0.076172, "value_mse_loss_layer_026": 0.060791, "value_mse_loss_layer_027": 0.079102, "value_mse_loss_layer_028": 0.084961, "value_mse_loss_layer_029": 0.113281, "value_mse_loss_layer_030": 0.105469, "value_mse_loss_layer_031": 0.145508, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 5.1e-05, "vq_loss_layer_004": 9.5e-05, "vq_loss_layer_005": 0.000113, "vq_loss_layer_006": 0.000186, "vq_loss_layer_007": 0.000284, "vq_loss_layer_008": 0.000353, "vq_loss_layer_009": 0.000364, "vq_loss_layer_010": 0.000336, "vq_loss_layer_011": 0.000372, "vq_loss_layer_012": 0.000626, "vq_loss_layer_013": 0.00058, "vq_loss_layer_014": 0.000591, "vq_loss_layer_015": 0.000633, "vq_loss_layer_016": 0.000626, "vq_loss_layer_017": 0.000572, "vq_loss_layer_018": 0.00033, "vq_loss_layer_019": 0.000299, "vq_loss_layer_020": 0.000338, "vq_loss_layer_021": 0.000725, "vq_loss_layer_022": 0.000408, "vq_loss_layer_023": 0.000458, "vq_loss_layer_024": 0.000378, "vq_loss_layer_025": 0.000507, "vq_loss_layer_026": 0.000767, "vq_loss_layer_027": 0.000744, "vq_loss_layer_028": 0.001236, "vq_loss_layer_029": 0.001892, "vq_loss_layer_030": 0.003021, "vq_loss_layer_031": 0.008423 }, { "ce_loss": 2.286431, "epoch": 0.00155, "grad_norm": 0.004809876903891563, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.062036, "kv_vq_loss": 0.000889, "learning_rate": 0.0007975829245425727, "loss": 0.062918, "step": 1550, "value_mse_loss_layer_000": 0.001007, "value_mse_loss_layer_001": 0.002747, "value_mse_loss_layer_002": 0.01123, "value_mse_loss_layer_003": 0.018066, "value_mse_loss_layer_004": 0.015869, "value_mse_loss_layer_005": 0.016113, "value_mse_loss_layer_006": 0.019043, "value_mse_loss_layer_007": 0.020996, "value_mse_loss_layer_008": 0.024048, "value_mse_loss_layer_009": 0.03125, "value_mse_loss_layer_010": 0.028076, "value_mse_loss_layer_011": 0.028931, "value_mse_loss_layer_012": 0.029785, "value_mse_loss_layer_013": 0.031738, "value_mse_loss_layer_014": 0.033691, "value_mse_loss_layer_015": 0.033447, "value_mse_loss_layer_016": 0.03125, "value_mse_loss_layer_017": 0.03418, "value_mse_loss_layer_018": 0.033936, "value_mse_loss_layer_019": 0.039307, "value_mse_loss_layer_020": 0.042969, "value_mse_loss_layer_021": 0.049561, "value_mse_loss_layer_022": 0.044434, "value_mse_loss_layer_023": 0.051758, "value_mse_loss_layer_024": 0.054443, "value_mse_loss_layer_025": 0.07373, "value_mse_loss_layer_026": 0.055664, "value_mse_loss_layer_027": 0.075684, "value_mse_loss_layer_028": 0.080566, "value_mse_loss_layer_029": 0.113281, "value_mse_loss_layer_030": 0.098633, "value_mse_loss_layer_031": 0.139648, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 9.5e-05, "vq_loss_layer_005": 0.000116, "vq_loss_layer_006": 0.000236, "vq_loss_layer_007": 0.000296, "vq_loss_layer_008": 0.000307, "vq_loss_layer_009": 0.000383, "vq_loss_layer_010": 0.000345, "vq_loss_layer_011": 0.000364, "vq_loss_layer_012": 0.000622, "vq_loss_layer_013": 0.000523, "vq_loss_layer_014": 0.000648, "vq_loss_layer_015": 0.000568, "vq_loss_layer_016": 0.000671, "vq_loss_layer_017": 0.000591, "vq_loss_layer_018": 0.000336, "vq_loss_layer_019": 0.00025, "vq_loss_layer_020": 0.000399, "vq_loss_layer_021": 0.000698, "vq_loss_layer_022": 0.000349, "vq_loss_layer_023": 0.00042, "vq_loss_layer_024": 0.000359, "vq_loss_layer_025": 0.000526, "vq_loss_layer_026": 0.000679, "vq_loss_layer_027": 0.000774, "vq_loss_layer_028": 0.00119, "vq_loss_layer_029": 0.001816, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.008545 }, { "ce_loss": 2.278836, "epoch": 0.00156, "grad_norm": 0.005868937354534864, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.06235, "kv_vq_loss": 0.000898, "learning_rate": 0.0007982811495886153, "loss": 0.063245, "step": 1560, "value_mse_loss_layer_000": 0.001015, "value_mse_loss_layer_001": 0.002777, "value_mse_loss_layer_002": 0.010986, "value_mse_loss_layer_003": 0.017822, "value_mse_loss_layer_004": 0.02002, "value_mse_loss_layer_005": 0.015747, "value_mse_loss_layer_006": 0.018433, "value_mse_loss_layer_007": 0.022217, "value_mse_loss_layer_008": 0.025146, "value_mse_loss_layer_009": 0.032471, "value_mse_loss_layer_010": 0.028076, "value_mse_loss_layer_011": 0.03125, "value_mse_loss_layer_012": 0.03125, "value_mse_loss_layer_013": 0.032227, "value_mse_loss_layer_014": 0.036133, "value_mse_loss_layer_015": 0.035889, "value_mse_loss_layer_016": 0.032227, "value_mse_loss_layer_017": 0.034912, "value_mse_loss_layer_018": 0.034424, "value_mse_loss_layer_019": 0.040283, "value_mse_loss_layer_020": 0.042236, "value_mse_loss_layer_021": 0.055908, "value_mse_loss_layer_022": 0.046875, "value_mse_loss_layer_023": 0.060303, "value_mse_loss_layer_024": 0.061035, "value_mse_loss_layer_025": 0.082031, "value_mse_loss_layer_026": 0.064941, "value_mse_loss_layer_027": 0.091797, "value_mse_loss_layer_028": 0.081055, "value_mse_loss_layer_029": 0.133789, "value_mse_loss_layer_030": 0.112305, "value_mse_loss_layer_031": 0.137695, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 0.000221, "vq_loss_layer_005": 0.000121, "vq_loss_layer_006": 0.000195, "vq_loss_layer_007": 0.000395, "vq_loss_layer_008": 0.000317, "vq_loss_layer_009": 0.000404, "vq_loss_layer_010": 0.000345, "vq_loss_layer_011": 0.000456, "vq_loss_layer_012": 0.000713, "vq_loss_layer_013": 0.000542, "vq_loss_layer_014": 0.000645, "vq_loss_layer_015": 0.000687, "vq_loss_layer_016": 0.000675, "vq_loss_layer_017": 0.000511, "vq_loss_layer_018": 0.000322, "vq_loss_layer_019": 0.000271, "vq_loss_layer_020": 0.000288, "vq_loss_layer_021": 0.000702, "vq_loss_layer_022": 0.000322, "vq_loss_layer_023": 0.000483, "vq_loss_layer_024": 0.000404, "vq_loss_layer_025": 0.0005, "vq_loss_layer_026": 0.000835, "vq_loss_layer_027": 0.000992, "vq_loss_layer_028": 0.000977, "vq_loss_layer_029": 0.002487, "vq_loss_layer_030": 0.00322, "vq_loss_layer_031": 0.007721 }, { "ce_loss": 2.262247, "epoch": 0.00157, "grad_norm": 0.004794728010892868, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.059814, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.114258, "key_mse_loss_layer_016": 0.11084, "key_mse_loss_layer_017": 0.11084, "key_mse_loss_layer_018": 0.120117, "key_mse_loss_layer_019": 0.100098, "key_mse_loss_layer_020": 0.11377, "key_mse_loss_layer_021": 0.10498, "key_mse_loss_layer_022": 0.111328, "key_mse_loss_layer_023": 0.115723, "key_mse_loss_layer_024": 0.09082, "key_mse_loss_layer_025": 0.086914, "key_mse_loss_layer_026": 0.103027, "key_mse_loss_layer_027": 0.099609, "key_mse_loss_layer_028": 0.108398, "key_mse_loss_layer_029": 0.099121, "key_mse_loss_layer_030": 0.100586, "key_mse_loss_layer_031": 0.078613, "kv_mse_loss": 0.061743, "kv_vq_loss": 0.000893, "learning_rate": 0.0007989749131023083, "loss": 0.062628, "step": 1570, "value_mse_loss_layer_000": 0.000942, "value_mse_loss_layer_001": 0.00267, "value_mse_loss_layer_002": 0.011047, "value_mse_loss_layer_003": 0.017578, "value_mse_loss_layer_004": 0.019165, "value_mse_loss_layer_005": 0.016479, "value_mse_loss_layer_006": 0.019043, "value_mse_loss_layer_007": 0.021851, "value_mse_loss_layer_008": 0.025391, "value_mse_loss_layer_009": 0.032471, "value_mse_loss_layer_010": 0.0271, "value_mse_loss_layer_011": 0.028442, "value_mse_loss_layer_012": 0.031006, "value_mse_loss_layer_013": 0.030884, "value_mse_loss_layer_014": 0.032715, "value_mse_loss_layer_015": 0.030762, "value_mse_loss_layer_016": 0.030273, "value_mse_loss_layer_017": 0.033691, "value_mse_loss_layer_018": 0.036621, "value_mse_loss_layer_019": 0.041504, "value_mse_loss_layer_020": 0.045166, "value_mse_loss_layer_021": 0.050049, "value_mse_loss_layer_022": 0.047363, "value_mse_loss_layer_023": 0.056641, "value_mse_loss_layer_024": 0.063477, "value_mse_loss_layer_025": 0.076172, "value_mse_loss_layer_026": 0.065918, "value_mse_loss_layer_027": 0.094238, "value_mse_loss_layer_028": 0.087891, "value_mse_loss_layer_029": 0.128906, "value_mse_loss_layer_030": 0.119629, "value_mse_loss_layer_031": 0.165039, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 2.8e-05, "vq_loss_layer_002": 3.4e-05, "vq_loss_layer_003": 6.6e-05, "vq_loss_layer_004": 0.000167, "vq_loss_layer_005": 0.000122, "vq_loss_layer_006": 0.000216, "vq_loss_layer_007": 0.000317, "vq_loss_layer_008": 0.000423, "vq_loss_layer_009": 0.000484, "vq_loss_layer_010": 0.00038, "vq_loss_layer_011": 0.000389, "vq_loss_layer_012": 0.000809, "vq_loss_layer_013": 0.0005, "vq_loss_layer_014": 0.000751, "vq_loss_layer_015": 0.000645, "vq_loss_layer_016": 0.000641, "vq_loss_layer_017": 0.000463, "vq_loss_layer_018": 0.000412, "vq_loss_layer_019": 0.000315, "vq_loss_layer_020": 0.000378, "vq_loss_layer_021": 0.000595, "vq_loss_layer_022": 0.000313, "vq_loss_layer_023": 0.000416, "vq_loss_layer_024": 0.000399, "vq_loss_layer_025": 0.000656, "vq_loss_layer_026": 0.000984, "vq_loss_layer_027": 0.001297, "vq_loss_layer_028": 0.001968, "vq_loss_layer_029": 0.003296, "vq_loss_layer_030": 0.003677, "vq_loss_layer_031": 0.013245 }, { "ce_loss": 2.311567, "epoch": 0.00158, "grad_norm": 0.00754220224916935, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.12793, "key_mse_loss_layer_015": 0.113281, "key_mse_loss_layer_016": 0.108398, "key_mse_loss_layer_017": 0.110352, "key_mse_loss_layer_018": 0.116211, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.106445, "key_mse_loss_layer_021": 0.100098, "key_mse_loss_layer_022": 0.103516, "key_mse_loss_layer_023": 0.102539, "key_mse_loss_layer_024": 0.082031, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.088867, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.061575, "kv_vq_loss": 0.000872, "learning_rate": 0.0007996642717386055, "loss": 0.062445, "step": 1580, "value_mse_loss_layer_000": 0.000984, "value_mse_loss_layer_001": 0.002747, "value_mse_loss_layer_002": 0.011353, "value_mse_loss_layer_003": 0.018066, "value_mse_loss_layer_004": 0.016235, "value_mse_loss_layer_005": 0.01709, "value_mse_loss_layer_006": 0.019653, "value_mse_loss_layer_007": 0.021851, "value_mse_loss_layer_008": 0.024536, "value_mse_loss_layer_009": 0.031494, "value_mse_loss_layer_010": 0.028809, "value_mse_loss_layer_011": 0.03064, "value_mse_loss_layer_012": 0.030396, "value_mse_loss_layer_013": 0.032715, "value_mse_loss_layer_014": 0.033936, "value_mse_loss_layer_015": 0.033691, "value_mse_loss_layer_016": 0.030762, "value_mse_loss_layer_017": 0.034912, "value_mse_loss_layer_018": 0.03418, "value_mse_loss_layer_019": 0.037842, "value_mse_loss_layer_020": 0.041748, "value_mse_loss_layer_021": 0.047852, "value_mse_loss_layer_022": 0.044678, "value_mse_loss_layer_023": 0.055176, "value_mse_loss_layer_024": 0.060059, "value_mse_loss_layer_025": 0.070801, "value_mse_loss_layer_026": 0.078613, "value_mse_loss_layer_027": 0.078613, "value_mse_loss_layer_028": 0.099121, "value_mse_loss_layer_029": 0.117676, "value_mse_loss_layer_030": 0.115723, "value_mse_loss_layer_031": 0.147461, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.8e-05, "vq_loss_layer_004": 0.000106, "vq_loss_layer_005": 0.00017, "vq_loss_layer_006": 0.00028, "vq_loss_layer_007": 0.000343, "vq_loss_layer_008": 0.000355, "vq_loss_layer_009": 0.000408, "vq_loss_layer_010": 0.000416, "vq_loss_layer_011": 0.000458, "vq_loss_layer_012": 0.000656, "vq_loss_layer_013": 0.000557, "vq_loss_layer_014": 0.000713, "vq_loss_layer_015": 0.000702, "vq_loss_layer_016": 0.000698, "vq_loss_layer_017": 0.000725, "vq_loss_layer_018": 0.000402, "vq_loss_layer_019": 0.000303, "vq_loss_layer_020": 0.000423, "vq_loss_layer_021": 0.000694, "vq_loss_layer_022": 0.000462, "vq_loss_layer_023": 0.000652, "vq_loss_layer_024": 0.000546, "vq_loss_layer_025": 0.000648, "vq_loss_layer_026": 0.001778, "vq_loss_layer_027": 0.000935, "vq_loss_layer_028": 0.002777, "vq_loss_layer_029": 0.00209, "vq_loss_layer_030": 0.003601, "vq_loss_layer_031": 0.010254 }, { "ce_loss": 2.307013, "epoch": 0.00159, "grad_norm": 0.005134656094014645, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.061279, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.081055, "kv_mse_loss": 0.061313, "kv_vq_loss": 0.000862, "learning_rate": 0.0008003492810801127, "loss": 0.062161, "step": 1590, "value_mse_loss_layer_000": 0.001022, "value_mse_loss_layer_001": 0.002777, "value_mse_loss_layer_002": 0.010742, "value_mse_loss_layer_003": 0.016846, "value_mse_loss_layer_004": 0.015198, "value_mse_loss_layer_005": 0.015137, "value_mse_loss_layer_006": 0.018188, "value_mse_loss_layer_007": 0.02063, "value_mse_loss_layer_008": 0.02478, "value_mse_loss_layer_009": 0.033447, "value_mse_loss_layer_010": 0.027466, "value_mse_loss_layer_011": 0.030396, "value_mse_loss_layer_012": 0.029663, "value_mse_loss_layer_013": 0.0354, "value_mse_loss_layer_014": 0.036377, "value_mse_loss_layer_015": 0.0354, "value_mse_loss_layer_016": 0.030884, "value_mse_loss_layer_017": 0.03418, "value_mse_loss_layer_018": 0.033936, "value_mse_loss_layer_019": 0.040039, "value_mse_loss_layer_020": 0.040283, "value_mse_loss_layer_021": 0.052246, "value_mse_loss_layer_022": 0.047363, "value_mse_loss_layer_023": 0.054688, "value_mse_loss_layer_024": 0.05542, "value_mse_loss_layer_025": 0.071289, "value_mse_loss_layer_026": 0.060303, "value_mse_loss_layer_027": 0.074219, "value_mse_loss_layer_028": 0.080566, "value_mse_loss_layer_029": 0.114746, "value_mse_loss_layer_030": 0.107422, "value_mse_loss_layer_031": 0.126953, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 9.5e-05, "vq_loss_layer_005": 0.000112, "vq_loss_layer_006": 0.000194, "vq_loss_layer_007": 0.000315, "vq_loss_layer_008": 0.000315, "vq_loss_layer_009": 0.000477, "vq_loss_layer_010": 0.000313, "vq_loss_layer_011": 0.000401, "vq_loss_layer_012": 0.000599, "vq_loss_layer_013": 0.000706, "vq_loss_layer_014": 0.000721, "vq_loss_layer_015": 0.000618, "vq_loss_layer_016": 0.000599, "vq_loss_layer_017": 0.000507, "vq_loss_layer_018": 0.000313, "vq_loss_layer_019": 0.00029, "vq_loss_layer_020": 0.000311, "vq_loss_layer_021": 0.000626, "vq_loss_layer_022": 0.00038, "vq_loss_layer_023": 0.000481, "vq_loss_layer_024": 0.000378, "vq_loss_layer_025": 0.000492, "vq_loss_layer_026": 0.000793, "vq_loss_layer_027": 0.000824, "vq_loss_layer_028": 0.001312, "vq_loss_layer_029": 0.002823, "vq_loss_layer_030": 0.003296, "vq_loss_layer_031": 0.008606 }, { "ce_loss": 2.308801, "epoch": 0.0016, "grad_norm": 0.0044905501417815685, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.061768, "kv_vq_loss": 0.000871, "learning_rate": 0.000801029995663981, "loss": 0.062628, "step": 1600, "value_mse_loss_layer_000": 0.000999, "value_mse_loss_layer_001": 0.002808, "value_mse_loss_layer_002": 0.010864, "value_mse_loss_layer_003": 0.0177, "value_mse_loss_layer_004": 0.015869, "value_mse_loss_layer_005": 0.018677, "value_mse_loss_layer_006": 0.019897, "value_mse_loss_layer_007": 0.02124, "value_mse_loss_layer_008": 0.025146, "value_mse_loss_layer_009": 0.032959, "value_mse_loss_layer_010": 0.028564, "value_mse_loss_layer_011": 0.031006, "value_mse_loss_layer_012": 0.032959, "value_mse_loss_layer_013": 0.032715, "value_mse_loss_layer_014": 0.034424, "value_mse_loss_layer_015": 0.036377, "value_mse_loss_layer_016": 0.031982, "value_mse_loss_layer_017": 0.037354, "value_mse_loss_layer_018": 0.032715, "value_mse_loss_layer_019": 0.038574, "value_mse_loss_layer_020": 0.041504, "value_mse_loss_layer_021": 0.049805, "value_mse_loss_layer_022": 0.045654, "value_mse_loss_layer_023": 0.055176, "value_mse_loss_layer_024": 0.055176, "value_mse_loss_layer_025": 0.072266, "value_mse_loss_layer_026": 0.060547, "value_mse_loss_layer_027": 0.088379, "value_mse_loss_layer_028": 0.078613, "value_mse_loss_layer_029": 0.124023, "value_mse_loss_layer_030": 0.103516, "value_mse_loss_layer_031": 0.137695, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 9.7e-05, "vq_loss_layer_005": 0.000181, "vq_loss_layer_006": 0.00028, "vq_loss_layer_007": 0.000307, "vq_loss_layer_008": 0.00036, "vq_loss_layer_009": 0.000433, "vq_loss_layer_010": 0.00038, "vq_loss_layer_011": 0.000418, "vq_loss_layer_012": 0.000786, "vq_loss_layer_013": 0.000553, "vq_loss_layer_014": 0.000656, "vq_loss_layer_015": 0.000675, "vq_loss_layer_016": 0.000675, "vq_loss_layer_017": 0.000839, "vq_loss_layer_018": 0.000349, "vq_loss_layer_019": 0.000286, "vq_loss_layer_020": 0.000383, "vq_loss_layer_021": 0.000713, "vq_loss_layer_022": 0.000378, "vq_loss_layer_023": 0.000511, "vq_loss_layer_024": 0.000401, "vq_loss_layer_025": 0.000591, "vq_loss_layer_026": 0.00079, "vq_loss_layer_027": 0.001244, "vq_loss_layer_028": 0.001205, "vq_loss_layer_029": 0.002243, "vq_loss_layer_030": 0.003128, "vq_loss_layer_031": 0.008484 }, { "ce_loss": 2.259461, "epoch": 0.00161, "grad_norm": 0.005143162794411182, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.061948, "kv_vq_loss": 0.000898, "learning_rate": 0.0008017064690079624, "loss": 0.062854, "step": 1610, "value_mse_loss_layer_000": 0.000999, "value_mse_loss_layer_001": 0.002762, "value_mse_loss_layer_002": 0.01123, "value_mse_loss_layer_003": 0.017578, "value_mse_loss_layer_004": 0.015747, "value_mse_loss_layer_005": 0.015503, "value_mse_loss_layer_006": 0.018433, "value_mse_loss_layer_007": 0.02124, "value_mse_loss_layer_008": 0.024536, "value_mse_loss_layer_009": 0.031494, "value_mse_loss_layer_010": 0.028076, "value_mse_loss_layer_011": 0.03064, "value_mse_loss_layer_012": 0.028931, "value_mse_loss_layer_013": 0.030518, "value_mse_loss_layer_014": 0.033203, "value_mse_loss_layer_015": 0.033936, "value_mse_loss_layer_016": 0.031006, "value_mse_loss_layer_017": 0.03418, "value_mse_loss_layer_018": 0.033936, "value_mse_loss_layer_019": 0.04126, "value_mse_loss_layer_020": 0.040527, "value_mse_loss_layer_021": 0.048584, "value_mse_loss_layer_022": 0.047119, "value_mse_loss_layer_023": 0.059326, "value_mse_loss_layer_024": 0.060547, "value_mse_loss_layer_025": 0.072266, "value_mse_loss_layer_026": 0.05835, "value_mse_loss_layer_027": 0.088867, "value_mse_loss_layer_028": 0.07666, "value_mse_loss_layer_029": 0.123535, "value_mse_loss_layer_030": 0.110352, "value_mse_loss_layer_031": 0.136719, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 0.00011, "vq_loss_layer_005": 0.000114, "vq_loss_layer_006": 0.000186, "vq_loss_layer_007": 0.000319, "vq_loss_layer_008": 0.000286, "vq_loss_layer_009": 0.00038, "vq_loss_layer_010": 0.000359, "vq_loss_layer_011": 0.000469, "vq_loss_layer_012": 0.000584, "vq_loss_layer_013": 0.000452, "vq_loss_layer_014": 0.000595, "vq_loss_layer_015": 0.000591, "vq_loss_layer_016": 0.000648, "vq_loss_layer_017": 0.000568, "vq_loss_layer_018": 0.000341, "vq_loss_layer_019": 0.000298, "vq_loss_layer_020": 0.000288, "vq_loss_layer_021": 0.000519, "vq_loss_layer_022": 0.000351, "vq_loss_layer_023": 0.000492, "vq_loss_layer_024": 0.000439, "vq_loss_layer_025": 0.000446, "vq_loss_layer_026": 0.000618, "vq_loss_layer_027": 0.000992, "vq_loss_layer_028": 0.000908, "vq_loss_layer_029": 0.00193, "vq_loss_layer_030": 0.003021, "vq_loss_layer_031": 0.007599 }, { "ce_loss": 2.311555, "epoch": 0.00162, "grad_norm": 0.005709671415388584, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.061572, "kv_vq_loss": 0.000853, "learning_rate": 0.0008023787536356576, "loss": 0.062411, "step": 1620, "value_mse_loss_layer_000": 0.000992, "value_mse_loss_layer_001": 0.002701, "value_mse_loss_layer_002": 0.011292, "value_mse_loss_layer_003": 0.017212, "value_mse_loss_layer_004": 0.016113, "value_mse_loss_layer_005": 0.015991, "value_mse_loss_layer_006": 0.018188, "value_mse_loss_layer_007": 0.020508, "value_mse_loss_layer_008": 0.023682, "value_mse_loss_layer_009": 0.031128, "value_mse_loss_layer_010": 0.026855, "value_mse_loss_layer_011": 0.028687, "value_mse_loss_layer_012": 0.031006, "value_mse_loss_layer_013": 0.031128, "value_mse_loss_layer_014": 0.031982, "value_mse_loss_layer_015": 0.033936, "value_mse_loss_layer_016": 0.037842, "value_mse_loss_layer_017": 0.034912, "value_mse_loss_layer_018": 0.032471, "value_mse_loss_layer_019": 0.038086, "value_mse_loss_layer_020": 0.04126, "value_mse_loss_layer_021": 0.049561, "value_mse_loss_layer_022": 0.044189, "value_mse_loss_layer_023": 0.052734, "value_mse_loss_layer_024": 0.059082, "value_mse_loss_layer_025": 0.072266, "value_mse_loss_layer_026": 0.06543, "value_mse_loss_layer_027": 0.082031, "value_mse_loss_layer_028": 0.07959, "value_mse_loss_layer_029": 0.117676, "value_mse_loss_layer_030": 0.10498, "value_mse_loss_layer_031": 0.139648, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 9.6e-05, "vq_loss_layer_005": 0.00012, "vq_loss_layer_006": 0.000188, "vq_loss_layer_007": 0.000275, "vq_loss_layer_008": 0.000301, "vq_loss_layer_009": 0.000399, "vq_loss_layer_010": 0.000351, "vq_loss_layer_011": 0.000385, "vq_loss_layer_012": 0.000786, "vq_loss_layer_013": 0.000523, "vq_loss_layer_014": 0.000626, "vq_loss_layer_015": 0.000641, "vq_loss_layer_016": 0.000931, "vq_loss_layer_017": 0.000683, "vq_loss_layer_018": 0.000332, "vq_loss_layer_019": 0.000273, "vq_loss_layer_020": 0.000355, "vq_loss_layer_021": 0.000732, "vq_loss_layer_022": 0.000357, "vq_loss_layer_023": 0.000467, "vq_loss_layer_024": 0.000446, "vq_loss_layer_025": 0.00066, "vq_loss_layer_026": 0.001167, "vq_loss_layer_027": 0.000992, "vq_loss_layer_028": 0.001488, "vq_loss_layer_029": 0.00264, "vq_loss_layer_030": 0.004272, "vq_loss_layer_031": 0.009888 }, { "ce_loss": 2.272155, "epoch": 0.00163, "grad_norm": 0.0058571165427565575, "key_mse_loss_layer_000": 0.004364, "key_mse_loss_layer_001": 0.011169, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.061496, "kv_vq_loss": 0.000869, "learning_rate": 0.0008030469011009894, "loss": 0.062378, "step": 1630, "value_mse_loss_layer_000": 0.00103, "value_mse_loss_layer_001": 0.002731, "value_mse_loss_layer_002": 0.010864, "value_mse_loss_layer_003": 0.018799, "value_mse_loss_layer_004": 0.015991, "value_mse_loss_layer_005": 0.017456, "value_mse_loss_layer_006": 0.019653, "value_mse_loss_layer_007": 0.021606, "value_mse_loss_layer_008": 0.024536, "value_mse_loss_layer_009": 0.031738, "value_mse_loss_layer_010": 0.031494, "value_mse_loss_layer_011": 0.030029, "value_mse_loss_layer_012": 0.03064, "value_mse_loss_layer_013": 0.03125, "value_mse_loss_layer_014": 0.03418, "value_mse_loss_layer_015": 0.034424, "value_mse_loss_layer_016": 0.031982, "value_mse_loss_layer_017": 0.033447, "value_mse_loss_layer_018": 0.032715, "value_mse_loss_layer_019": 0.042725, "value_mse_loss_layer_020": 0.04541, "value_mse_loss_layer_021": 0.04834, "value_mse_loss_layer_022": 0.049805, "value_mse_loss_layer_023": 0.058594, "value_mse_loss_layer_024": 0.062988, "value_mse_loss_layer_025": 0.075195, "value_mse_loss_layer_026": 0.060791, "value_mse_loss_layer_027": 0.078613, "value_mse_loss_layer_028": 0.07373, "value_mse_loss_layer_029": 0.121094, "value_mse_loss_layer_030": 0.102051, "value_mse_loss_layer_031": 0.136719, "vq_loss_layer_000": 1.5e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 0.000103, "vq_loss_layer_005": 0.000162, "vq_loss_layer_006": 0.000243, "vq_loss_layer_007": 0.000319, "vq_loss_layer_008": 0.000307, "vq_loss_layer_009": 0.000389, "vq_loss_layer_010": 0.000412, "vq_loss_layer_011": 0.000408, "vq_loss_layer_012": 0.000645, "vq_loss_layer_013": 0.000467, "vq_loss_layer_014": 0.000706, "vq_loss_layer_015": 0.000706, "vq_loss_layer_016": 0.000702, "vq_loss_layer_017": 0.000511, "vq_loss_layer_018": 0.000347, "vq_loss_layer_019": 0.000336, "vq_loss_layer_020": 0.000355, "vq_loss_layer_021": 0.000645, "vq_loss_layer_022": 0.000481, "vq_loss_layer_023": 0.000587, "vq_loss_layer_024": 0.000526, "vq_loss_layer_025": 0.000603, "vq_loss_layer_026": 0.000954, "vq_loss_layer_027": 0.000839, "vq_loss_layer_028": 0.001167, "vq_loss_layer_029": 0.002441, "vq_loss_layer_030": 0.003296, "vq_loss_layer_031": 0.009033 }, { "ce_loss": 2.301331, "epoch": 0.00164, "grad_norm": 0.005616513546556234, "key_mse_loss_layer_000": 0.002808, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.061469, "kv_vq_loss": 0.000875, "learning_rate": 0.0008037109620119242, "loss": 0.062317, "step": 1640, "value_mse_loss_layer_000": 0.000984, "value_mse_loss_layer_001": 0.002716, "value_mse_loss_layer_002": 0.01123, "value_mse_loss_layer_003": 0.018433, "value_mse_loss_layer_004": 0.015991, "value_mse_loss_layer_005": 0.016602, "value_mse_loss_layer_006": 0.019043, "value_mse_loss_layer_007": 0.02124, "value_mse_loss_layer_008": 0.024658, "value_mse_loss_layer_009": 0.032959, "value_mse_loss_layer_010": 0.028564, "value_mse_loss_layer_011": 0.030396, "value_mse_loss_layer_012": 0.032715, "value_mse_loss_layer_013": 0.032471, "value_mse_loss_layer_014": 0.035889, "value_mse_loss_layer_015": 0.036377, "value_mse_loss_layer_016": 0.031982, "value_mse_loss_layer_017": 0.036377, "value_mse_loss_layer_018": 0.033447, "value_mse_loss_layer_019": 0.038818, "value_mse_loss_layer_020": 0.04126, "value_mse_loss_layer_021": 0.051514, "value_mse_loss_layer_022": 0.046387, "value_mse_loss_layer_023": 0.051025, "value_mse_loss_layer_024": 0.057373, "value_mse_loss_layer_025": 0.07666, "value_mse_loss_layer_026": 0.056396, "value_mse_loss_layer_027": 0.074707, "value_mse_loss_layer_028": 0.074707, "value_mse_loss_layer_029": 0.112793, "value_mse_loss_layer_030": 0.098633, "value_mse_loss_layer_031": 0.135742, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 0.000101, "vq_loss_layer_005": 0.000142, "vq_loss_layer_006": 0.000229, "vq_loss_layer_007": 0.000299, "vq_loss_layer_008": 0.000315, "vq_loss_layer_009": 0.000439, "vq_loss_layer_010": 0.000381, "vq_loss_layer_011": 0.000431, "vq_loss_layer_012": 0.000801, "vq_loss_layer_013": 0.000523, "vq_loss_layer_014": 0.000713, "vq_loss_layer_015": 0.000763, "vq_loss_layer_016": 0.000664, "vq_loss_layer_017": 0.000652, "vq_loss_layer_018": 0.000381, "vq_loss_layer_019": 0.000334, "vq_loss_layer_020": 0.000359, "vq_loss_layer_021": 0.000782, "vq_loss_layer_022": 0.00045, "vq_loss_layer_023": 0.000511, "vq_loss_layer_024": 0.000471, "vq_loss_layer_025": 0.000591, "vq_loss_layer_026": 0.000698, "vq_loss_layer_027": 0.000801, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.001602, "vq_loss_layer_030": 0.002762, "vq_loss_layer_031": 0.008789 }, { "ce_loss": 2.304171, "epoch": 0.00165, "grad_norm": 0.006211706902831793, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.088379, "kv_mse_loss": 0.061804, "kv_vq_loss": 0.000869, "learning_rate": 0.0008043709860534765, "loss": 0.062665, "step": 1650, "value_mse_loss_layer_000": 0.000992, "value_mse_loss_layer_001": 0.002701, "value_mse_loss_layer_002": 0.011353, "value_mse_loss_layer_003": 0.017334, "value_mse_loss_layer_004": 0.01532, "value_mse_loss_layer_005": 0.015442, "value_mse_loss_layer_006": 0.018311, "value_mse_loss_layer_007": 0.020508, "value_mse_loss_layer_008": 0.025513, "value_mse_loss_layer_009": 0.031738, "value_mse_loss_layer_010": 0.026733, "value_mse_loss_layer_011": 0.029419, "value_mse_loss_layer_012": 0.030151, "value_mse_loss_layer_013": 0.030884, "value_mse_loss_layer_014": 0.033203, "value_mse_loss_layer_015": 0.036621, "value_mse_loss_layer_016": 0.032715, "value_mse_loss_layer_017": 0.034668, "value_mse_loss_layer_018": 0.032471, "value_mse_loss_layer_019": 0.041504, "value_mse_loss_layer_020": 0.042969, "value_mse_loss_layer_021": 0.051025, "value_mse_loss_layer_022": 0.044434, "value_mse_loss_layer_023": 0.059814, "value_mse_loss_layer_024": 0.056396, "value_mse_loss_layer_025": 0.074219, "value_mse_loss_layer_026": 0.0625, "value_mse_loss_layer_027": 0.087402, "value_mse_loss_layer_028": 0.07373, "value_mse_loss_layer_029": 0.121094, "value_mse_loss_layer_030": 0.100586, "value_mse_loss_layer_031": 0.125977, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 9.4e-05, "vq_loss_layer_005": 0.000121, "vq_loss_layer_006": 0.000192, "vq_loss_layer_007": 0.000284, "vq_loss_layer_008": 0.000349, "vq_loss_layer_009": 0.000366, "vq_loss_layer_010": 0.000288, "vq_loss_layer_011": 0.000359, "vq_loss_layer_012": 0.000637, "vq_loss_layer_013": 0.000469, "vq_loss_layer_014": 0.000603, "vq_loss_layer_015": 0.000641, "vq_loss_layer_016": 0.000671, "vq_loss_layer_017": 0.000538, "vq_loss_layer_018": 0.000299, "vq_loss_layer_019": 0.000294, "vq_loss_layer_020": 0.000378, "vq_loss_layer_021": 0.000687, "vq_loss_layer_022": 0.000336, "vq_loss_layer_023": 0.000587, "vq_loss_layer_024": 0.000397, "vq_loss_layer_025": 0.000538, "vq_loss_layer_026": 0.00082, "vq_loss_layer_027": 0.001137, "vq_loss_layer_028": 0.001236, "vq_loss_layer_029": 0.002625, "vq_loss_layer_030": 0.003281, "vq_loss_layer_031": 0.008545 }, { "ce_loss": 2.310333, "epoch": 0.00166, "grad_norm": 0.0062890066765248775, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.05957, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.061145, "kv_vq_loss": 0.000864, "learning_rate": 0.0008050270220100136, "loss": 0.062009, "step": 1660, "value_mse_loss_layer_000": 0.000984, "value_mse_loss_layer_001": 0.002716, "value_mse_loss_layer_002": 0.010803, "value_mse_loss_layer_003": 0.016724, "value_mse_loss_layer_004": 0.015259, "value_mse_loss_layer_005": 0.015747, "value_mse_loss_layer_006": 0.019165, "value_mse_loss_layer_007": 0.021484, "value_mse_loss_layer_008": 0.023926, "value_mse_loss_layer_009": 0.031006, "value_mse_loss_layer_010": 0.027832, "value_mse_loss_layer_011": 0.029419, "value_mse_loss_layer_012": 0.029541, "value_mse_loss_layer_013": 0.033447, "value_mse_loss_layer_014": 0.032471, "value_mse_loss_layer_015": 0.034668, "value_mse_loss_layer_016": 0.030762, "value_mse_loss_layer_017": 0.0354, "value_mse_loss_layer_018": 0.034912, "value_mse_loss_layer_019": 0.03833, "value_mse_loss_layer_020": 0.040527, "value_mse_loss_layer_021": 0.049316, "value_mse_loss_layer_022": 0.059814, "value_mse_loss_layer_023": 0.051514, "value_mse_loss_layer_024": 0.05957, "value_mse_loss_layer_025": 0.069336, "value_mse_loss_layer_026": 0.056396, "value_mse_loss_layer_027": 0.079102, "value_mse_loss_layer_028": 0.100586, "value_mse_loss_layer_029": 0.115234, "value_mse_loss_layer_030": 0.100098, "value_mse_loss_layer_031": 0.132812, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 0.000113, "vq_loss_layer_005": 0.000127, "vq_loss_layer_006": 0.000244, "vq_loss_layer_007": 0.000364, "vq_loss_layer_008": 0.000271, "vq_loss_layer_009": 0.000357, "vq_loss_layer_010": 0.000303, "vq_loss_layer_011": 0.000364, "vq_loss_layer_012": 0.00061, "vq_loss_layer_013": 0.000595, "vq_loss_layer_014": 0.000572, "vq_loss_layer_015": 0.000587, "vq_loss_layer_016": 0.000591, "vq_loss_layer_017": 0.000607, "vq_loss_layer_018": 0.000351, "vq_loss_layer_019": 0.00023, "vq_loss_layer_020": 0.000282, "vq_loss_layer_021": 0.000587, "vq_loss_layer_022": 0.000614, "vq_loss_layer_023": 0.000399, "vq_loss_layer_024": 0.000402, "vq_loss_layer_025": 0.00041, "vq_loss_layer_026": 0.000618, "vq_loss_layer_027": 0.000748, "vq_loss_layer_028": 0.001633, "vq_loss_layer_029": 0.001892, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.007996 }, { "ce_loss": 2.290747, "epoch": 0.00167, "grad_norm": 0.005989869590848684, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.061758, "kv_vq_loss": 0.000889, "learning_rate": 0.0008056791177868956, "loss": 0.06265, "step": 1670, "value_mse_loss_layer_000": 0.000992, "value_mse_loss_layer_001": 0.002747, "value_mse_loss_layer_002": 0.011108, "value_mse_loss_layer_003": 0.017822, "value_mse_loss_layer_004": 0.016479, "value_mse_loss_layer_005": 0.016724, "value_mse_loss_layer_006": 0.019165, "value_mse_loss_layer_007": 0.021606, "value_mse_loss_layer_008": 0.02478, "value_mse_loss_layer_009": 0.034668, "value_mse_loss_layer_010": 0.028687, "value_mse_loss_layer_011": 0.031006, "value_mse_loss_layer_012": 0.033203, "value_mse_loss_layer_013": 0.031738, "value_mse_loss_layer_014": 0.033203, "value_mse_loss_layer_015": 0.034912, "value_mse_loss_layer_016": 0.032227, "value_mse_loss_layer_017": 0.034424, "value_mse_loss_layer_018": 0.034668, "value_mse_loss_layer_019": 0.039795, "value_mse_loss_layer_020": 0.041016, "value_mse_loss_layer_021": 0.045898, "value_mse_loss_layer_022": 0.046875, "value_mse_loss_layer_023": 0.053467, "value_mse_loss_layer_024": 0.057373, "value_mse_loss_layer_025": 0.072266, "value_mse_loss_layer_026": 0.063477, "value_mse_loss_layer_027": 0.080566, "value_mse_loss_layer_028": 0.082031, "value_mse_loss_layer_029": 0.140625, "value_mse_loss_layer_030": 0.105469, "value_mse_loss_layer_031": 0.140625, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 0.000122, "vq_loss_layer_005": 0.000144, "vq_loss_layer_006": 0.00024, "vq_loss_layer_007": 0.000322, "vq_loss_layer_008": 0.000311, "vq_loss_layer_009": 0.00053, "vq_loss_layer_010": 0.000391, "vq_loss_layer_011": 0.000443, "vq_loss_layer_012": 0.000866, "vq_loss_layer_013": 0.000542, "vq_loss_layer_014": 0.000614, "vq_loss_layer_015": 0.000618, "vq_loss_layer_016": 0.000721, "vq_loss_layer_017": 0.000561, "vq_loss_layer_018": 0.000441, "vq_loss_layer_019": 0.00032, "vq_loss_layer_020": 0.000416, "vq_loss_layer_021": 0.00058, "vq_loss_layer_022": 0.000496, "vq_loss_layer_023": 0.000437, "vq_loss_layer_024": 0.000534, "vq_loss_layer_025": 0.000648, "vq_loss_layer_026": 0.001045, "vq_loss_layer_027": 0.001099, "vq_loss_layer_028": 0.001419, "vq_loss_layer_029": 0.003571, "vq_loss_layer_030": 0.00322, "vq_loss_layer_031": 0.00946 }, { "ce_loss": 2.30404, "epoch": 0.00168, "grad_norm": 0.005014282185584307, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.061295, "kv_vq_loss": 0.000864, "learning_rate": 0.0008063273204314657, "loss": 0.062131, "step": 1680, "value_mse_loss_layer_000": 0.000992, "value_mse_loss_layer_001": 0.002701, "value_mse_loss_layer_002": 0.011292, "value_mse_loss_layer_003": 0.01709, "value_mse_loss_layer_004": 0.015076, "value_mse_loss_layer_005": 0.015564, "value_mse_loss_layer_006": 0.018555, "value_mse_loss_layer_007": 0.02063, "value_mse_loss_layer_008": 0.024292, "value_mse_loss_layer_009": 0.032227, "value_mse_loss_layer_010": 0.027832, "value_mse_loss_layer_011": 0.030884, "value_mse_loss_layer_012": 0.030151, "value_mse_loss_layer_013": 0.031982, "value_mse_loss_layer_014": 0.035645, "value_mse_loss_layer_015": 0.036133, "value_mse_loss_layer_016": 0.03125, "value_mse_loss_layer_017": 0.037109, "value_mse_loss_layer_018": 0.033447, "value_mse_loss_layer_019": 0.045654, "value_mse_loss_layer_020": 0.041504, "value_mse_loss_layer_021": 0.062256, "value_mse_loss_layer_022": 0.046631, "value_mse_loss_layer_023": 0.063965, "value_mse_loss_layer_024": 0.054688, "value_mse_loss_layer_025": 0.07373, "value_mse_loss_layer_026": 0.060547, "value_mse_loss_layer_027": 0.077148, "value_mse_loss_layer_028": 0.078125, "value_mse_loss_layer_029": 0.115234, "value_mse_loss_layer_030": 0.101562, "value_mse_loss_layer_031": 0.139648, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 8.8e-05, "vq_loss_layer_005": 0.000122, "vq_loss_layer_006": 0.000197, "vq_loss_layer_007": 0.000286, "vq_loss_layer_008": 0.000286, "vq_loss_layer_009": 0.000397, "vq_loss_layer_010": 0.000332, "vq_loss_layer_011": 0.000448, "vq_loss_layer_012": 0.000618, "vq_loss_layer_013": 0.000511, "vq_loss_layer_014": 0.000629, "vq_loss_layer_015": 0.000793, "vq_loss_layer_016": 0.000603, "vq_loss_layer_017": 0.000786, "vq_loss_layer_018": 0.000313, "vq_loss_layer_019": 0.000336, "vq_loss_layer_020": 0.000349, "vq_loss_layer_021": 0.000984, "vq_loss_layer_022": 0.000378, "vq_loss_layer_023": 0.000698, "vq_loss_layer_024": 0.000397, "vq_loss_layer_025": 0.000515, "vq_loss_layer_026": 0.000801, "vq_loss_layer_027": 0.000782, "vq_loss_layer_028": 0.001045, "vq_loss_layer_029": 0.002335, "vq_loss_layer_030": 0.002899, "vq_loss_layer_031": 0.008179 }, { "ce_loss": 2.269131, "epoch": 0.00169, "grad_norm": 0.00458180857822299, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.095215, "key_mse_loss_layer_009": 0.101074, "key_mse_loss_layer_010": 0.11377, "key_mse_loss_layer_011": 0.109863, "key_mse_loss_layer_012": 0.083984, "key_mse_loss_layer_013": 0.151367, "key_mse_loss_layer_014": 0.146484, "key_mse_loss_layer_015": 0.131836, "key_mse_loss_layer_016": 0.128906, "key_mse_loss_layer_017": 0.125, "key_mse_loss_layer_018": 0.132812, "key_mse_loss_layer_019": 0.101562, "key_mse_loss_layer_020": 0.118164, "key_mse_loss_layer_021": 0.112305, "key_mse_loss_layer_022": 0.121094, "key_mse_loss_layer_023": 0.118164, "key_mse_loss_layer_024": 0.095703, "key_mse_loss_layer_025": 0.085938, "key_mse_loss_layer_026": 0.103027, "key_mse_loss_layer_027": 0.099121, "key_mse_loss_layer_028": 0.105957, "key_mse_loss_layer_029": 0.092773, "key_mse_loss_layer_030": 0.10791, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.061441, "kv_vq_loss": 0.000881, "learning_rate": 0.0008069716761534183, "loss": 0.062305, "step": 1690, "value_mse_loss_layer_000": 0.000999, "value_mse_loss_layer_001": 0.002701, "value_mse_loss_layer_002": 0.011047, "value_mse_loss_layer_003": 0.018066, "value_mse_loss_layer_004": 0.016602, "value_mse_loss_layer_005": 0.019653, "value_mse_loss_layer_006": 0.018921, "value_mse_loss_layer_007": 0.022217, "value_mse_loss_layer_008": 0.023926, "value_mse_loss_layer_009": 0.030151, "value_mse_loss_layer_010": 0.0271, "value_mse_loss_layer_011": 0.028931, "value_mse_loss_layer_012": 0.031982, "value_mse_loss_layer_013": 0.030518, "value_mse_loss_layer_014": 0.033691, "value_mse_loss_layer_015": 0.03064, "value_mse_loss_layer_016": 0.026733, "value_mse_loss_layer_017": 0.031494, "value_mse_loss_layer_018": 0.030884, "value_mse_loss_layer_019": 0.035645, "value_mse_loss_layer_020": 0.038818, "value_mse_loss_layer_021": 0.048828, "value_mse_loss_layer_022": 0.039551, "value_mse_loss_layer_023": 0.047607, "value_mse_loss_layer_024": 0.049805, "value_mse_loss_layer_025": 0.062256, "value_mse_loss_layer_026": 0.051758, "value_mse_loss_layer_027": 0.086914, "value_mse_loss_layer_028": 0.067871, "value_mse_loss_layer_029": 0.098145, "value_mse_loss_layer_030": 0.097656, "value_mse_loss_layer_031": 0.136719, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 2.6e-05, "vq_loss_layer_002": 3.3e-05, "vq_loss_layer_003": 6.3e-05, "vq_loss_layer_004": 0.000141, "vq_loss_layer_005": 0.000284, "vq_loss_layer_006": 0.000246, "vq_loss_layer_007": 0.000359, "vq_loss_layer_008": 0.000399, "vq_loss_layer_009": 0.000422, "vq_loss_layer_010": 0.000484, "vq_loss_layer_011": 0.000469, "vq_loss_layer_012": 0.000969, "vq_loss_layer_013": 0.000504, "vq_loss_layer_014": 0.000847, "vq_loss_layer_015": 0.000591, "vq_loss_layer_016": 0.000576, "vq_loss_layer_017": 0.00058, "vq_loss_layer_018": 0.000336, "vq_loss_layer_019": 0.000275, "vq_loss_layer_020": 0.000368, "vq_loss_layer_021": 0.000992, "vq_loss_layer_022": 0.000439, "vq_loss_layer_023": 0.000576, "vq_loss_layer_024": 0.000504, "vq_loss_layer_025": 0.000771, "vq_loss_layer_026": 0.000896, "vq_loss_layer_027": 0.001511, "vq_loss_layer_028": 0.001312, "vq_loss_layer_029": 0.002289, "vq_loss_layer_030": 0.003998, "vq_loss_layer_031": 0.010803 }, { "ce_loss": 2.280856, "epoch": 0.0017, "grad_norm": 0.006179137155413628, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.061307, "kv_vq_loss": 0.00084, "learning_rate": 0.0008076122303445683, "loss": 0.06214, "step": 1700, "value_mse_loss_layer_000": 0.000999, "value_mse_loss_layer_001": 0.002716, "value_mse_loss_layer_002": 0.010803, "value_mse_loss_layer_003": 0.016968, "value_mse_loss_layer_004": 0.015198, "value_mse_loss_layer_005": 0.015991, "value_mse_loss_layer_006": 0.019165, "value_mse_loss_layer_007": 0.020508, "value_mse_loss_layer_008": 0.024292, "value_mse_loss_layer_009": 0.031738, "value_mse_loss_layer_010": 0.027466, "value_mse_loss_layer_011": 0.030273, "value_mse_loss_layer_012": 0.030273, "value_mse_loss_layer_013": 0.03125, "value_mse_loss_layer_014": 0.032715, "value_mse_loss_layer_015": 0.034424, "value_mse_loss_layer_016": 0.031494, "value_mse_loss_layer_017": 0.036133, "value_mse_loss_layer_018": 0.034424, "value_mse_loss_layer_019": 0.038574, "value_mse_loss_layer_020": 0.042725, "value_mse_loss_layer_021": 0.049561, "value_mse_loss_layer_022": 0.049561, "value_mse_loss_layer_023": 0.052002, "value_mse_loss_layer_024": 0.063965, "value_mse_loss_layer_025": 0.075195, "value_mse_loss_layer_026": 0.056152, "value_mse_loss_layer_027": 0.082031, "value_mse_loss_layer_028": 0.079102, "value_mse_loss_layer_029": 0.12793, "value_mse_loss_layer_030": 0.099609, "value_mse_loss_layer_031": 0.128906, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 9.1e-05, "vq_loss_layer_005": 0.000127, "vq_loss_layer_006": 0.000239, "vq_loss_layer_007": 0.00029, "vq_loss_layer_008": 0.000286, "vq_loss_layer_009": 0.00038, "vq_loss_layer_010": 0.000332, "vq_loss_layer_011": 0.00041, "vq_loss_layer_012": 0.000633, "vq_loss_layer_013": 0.000515, "vq_loss_layer_014": 0.000565, "vq_loss_layer_015": 0.000587, "vq_loss_layer_016": 0.000607, "vq_loss_layer_017": 0.000622, "vq_loss_layer_018": 0.00034, "vq_loss_layer_019": 0.000248, "vq_loss_layer_020": 0.000385, "vq_loss_layer_021": 0.000603, "vq_loss_layer_022": 0.00041, "vq_loss_layer_023": 0.000381, "vq_loss_layer_024": 0.000437, "vq_loss_layer_025": 0.000471, "vq_loss_layer_026": 0.000618, "vq_loss_layer_027": 0.000942, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.00193, "vq_loss_layer_030": 0.00264, "vq_loss_layer_031": 0.007996 }, { "ce_loss": 2.298622, "epoch": 0.00171, "grad_norm": 0.004995394963771105, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.061472, "kv_vq_loss": 0.000858, "learning_rate": 0.0008082490275980384, "loss": 0.06232, "step": 1710, "value_mse_loss_layer_000": 0.000969, "value_mse_loss_layer_001": 0.002701, "value_mse_loss_layer_002": 0.010742, "value_mse_loss_layer_003": 0.017212, "value_mse_loss_layer_004": 0.015747, "value_mse_loss_layer_005": 0.015747, "value_mse_loss_layer_006": 0.018311, "value_mse_loss_layer_007": 0.02063, "value_mse_loss_layer_008": 0.023804, "value_mse_loss_layer_009": 0.031494, "value_mse_loss_layer_010": 0.026733, "value_mse_loss_layer_011": 0.028198, "value_mse_loss_layer_012": 0.029053, "value_mse_loss_layer_013": 0.030151, "value_mse_loss_layer_014": 0.032471, "value_mse_loss_layer_015": 0.041748, "value_mse_loss_layer_016": 0.030518, "value_mse_loss_layer_017": 0.032959, "value_mse_loss_layer_018": 0.031738, "value_mse_loss_layer_019": 0.038574, "value_mse_loss_layer_020": 0.039795, "value_mse_loss_layer_021": 0.051025, "value_mse_loss_layer_022": 0.045898, "value_mse_loss_layer_023": 0.05127, "value_mse_loss_layer_024": 0.062012, "value_mse_loss_layer_025": 0.071777, "value_mse_loss_layer_026": 0.058594, "value_mse_loss_layer_027": 0.080078, "value_mse_loss_layer_028": 0.080078, "value_mse_loss_layer_029": 0.122559, "value_mse_loss_layer_030": 0.119141, "value_mse_loss_layer_031": 0.134766, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 5.1e-05, "vq_loss_layer_004": 9.2e-05, "vq_loss_layer_005": 0.000113, "vq_loss_layer_006": 0.000193, "vq_loss_layer_007": 0.000284, "vq_loss_layer_008": 0.000309, "vq_loss_layer_009": 0.000401, "vq_loss_layer_010": 0.00034, "vq_loss_layer_011": 0.000366, "vq_loss_layer_012": 0.000614, "vq_loss_layer_013": 0.000483, "vq_loss_layer_014": 0.000614, "vq_loss_layer_015": 0.001106, "vq_loss_layer_016": 0.00069, "vq_loss_layer_017": 0.000504, "vq_loss_layer_018": 0.000326, "vq_loss_layer_019": 0.00029, "vq_loss_layer_020": 0.000332, "vq_loss_layer_021": 0.000744, "vq_loss_layer_022": 0.000423, "vq_loss_layer_023": 0.000412, "vq_loss_layer_024": 0.000473, "vq_loss_layer_025": 0.000553, "vq_loss_layer_026": 0.000725, "vq_loss_layer_027": 0.000889, "vq_loss_layer_028": 0.001381, "vq_loss_layer_029": 0.002548, "vq_loss_layer_030": 0.003647, "vq_loss_layer_031": 0.009277 }, { "ce_loss": 2.328642, "epoch": 0.00172, "grad_norm": 0.005454800557345152, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.061111, "kv_vq_loss": 0.000849, "learning_rate": 0.0008088821117268871, "loss": 0.061932, "step": 1720, "value_mse_loss_layer_000": 0.000946, "value_mse_loss_layer_001": 0.00264, "value_mse_loss_layer_002": 0.011169, "value_mse_loss_layer_003": 0.017334, "value_mse_loss_layer_004": 0.014893, "value_mse_loss_layer_005": 0.015137, "value_mse_loss_layer_006": 0.017822, "value_mse_loss_layer_007": 0.021118, "value_mse_loss_layer_008": 0.024292, "value_mse_loss_layer_009": 0.031982, "value_mse_loss_layer_010": 0.026489, "value_mse_loss_layer_011": 0.029663, "value_mse_loss_layer_012": 0.033203, "value_mse_loss_layer_013": 0.030884, "value_mse_loss_layer_014": 0.037598, "value_mse_loss_layer_015": 0.036621, "value_mse_loss_layer_016": 0.029541, "value_mse_loss_layer_017": 0.033691, "value_mse_loss_layer_018": 0.035645, "value_mse_loss_layer_019": 0.040771, "value_mse_loss_layer_020": 0.040527, "value_mse_loss_layer_021": 0.048584, "value_mse_loss_layer_022": 0.044678, "value_mse_loss_layer_023": 0.054443, "value_mse_loss_layer_024": 0.059814, "value_mse_loss_layer_025": 0.075195, "value_mse_loss_layer_026": 0.057861, "value_mse_loss_layer_027": 0.076172, "value_mse_loss_layer_028": 0.075195, "value_mse_loss_layer_029": 0.113281, "value_mse_loss_layer_030": 0.113281, "value_mse_loss_layer_031": 0.131836, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 9.5e-05, "vq_loss_layer_005": 0.00011, "vq_loss_layer_006": 0.000186, "vq_loss_layer_007": 0.000332, "vq_loss_layer_008": 0.000299, "vq_loss_layer_009": 0.000431, "vq_loss_layer_010": 0.000296, "vq_loss_layer_011": 0.000433, "vq_loss_layer_012": 0.000961, "vq_loss_layer_013": 0.000475, "vq_loss_layer_014": 0.00082, "vq_loss_layer_015": 0.00066, "vq_loss_layer_016": 0.000576, "vq_loss_layer_017": 0.000511, "vq_loss_layer_018": 0.000374, "vq_loss_layer_019": 0.000271, "vq_loss_layer_020": 0.000299, "vq_loss_layer_021": 0.00058, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.000462, "vq_loss_layer_024": 0.000397, "vq_loss_layer_025": 0.000477, "vq_loss_layer_026": 0.00071, "vq_loss_layer_027": 0.000698, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.001785, "vq_loss_layer_030": 0.00293, "vq_loss_layer_031": 0.007538 }, { "ce_loss": 2.285927, "epoch": 0.00173, "grad_norm": 0.006310728378593922, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.061087, "kv_vq_loss": 0.000863, "learning_rate": 0.0008095115257821987, "loss": 0.061929, "step": 1730, "value_mse_loss_layer_000": 0.001007, "value_mse_loss_layer_001": 0.002747, "value_mse_loss_layer_002": 0.010681, "value_mse_loss_layer_003": 0.01709, "value_mse_loss_layer_004": 0.019043, "value_mse_loss_layer_005": 0.016113, "value_mse_loss_layer_006": 0.018066, "value_mse_loss_layer_007": 0.020752, "value_mse_loss_layer_008": 0.02417, "value_mse_loss_layer_009": 0.030518, "value_mse_loss_layer_010": 0.03125, "value_mse_loss_layer_011": 0.028564, "value_mse_loss_layer_012": 0.029175, "value_mse_loss_layer_013": 0.029907, "value_mse_loss_layer_014": 0.031738, "value_mse_loss_layer_015": 0.033447, "value_mse_loss_layer_016": 0.031494, "value_mse_loss_layer_017": 0.032715, "value_mse_loss_layer_018": 0.034424, "value_mse_loss_layer_019": 0.036621, "value_mse_loss_layer_020": 0.040771, "value_mse_loss_layer_021": 0.052246, "value_mse_loss_layer_022": 0.04541, "value_mse_loss_layer_023": 0.061768, "value_mse_loss_layer_024": 0.054688, "value_mse_loss_layer_025": 0.075684, "value_mse_loss_layer_026": 0.067383, "value_mse_loss_layer_027": 0.088867, "value_mse_loss_layer_028": 0.07959, "value_mse_loss_layer_029": 0.11377, "value_mse_loss_layer_030": 0.101074, "value_mse_loss_layer_031": 0.132812, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 0.000198, "vq_loss_layer_005": 0.000122, "vq_loss_layer_006": 0.000185, "vq_loss_layer_007": 0.000319, "vq_loss_layer_008": 0.00033, "vq_loss_layer_009": 0.000366, "vq_loss_layer_010": 0.000416, "vq_loss_layer_011": 0.000391, "vq_loss_layer_012": 0.000622, "vq_loss_layer_013": 0.000534, "vq_loss_layer_014": 0.000629, "vq_loss_layer_015": 0.000618, "vq_loss_layer_016": 0.000721, "vq_loss_layer_017": 0.00058, "vq_loss_layer_018": 0.000366, "vq_loss_layer_019": 0.000265, "vq_loss_layer_020": 0.000326, "vq_loss_layer_021": 0.00079, "vq_loss_layer_022": 0.000391, "vq_loss_layer_023": 0.000568, "vq_loss_layer_024": 0.000406, "vq_loss_layer_025": 0.000587, "vq_loss_layer_026": 0.001091, "vq_loss_layer_027": 0.001129, "vq_loss_layer_028": 0.001297, "vq_loss_layer_029": 0.002136, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.00885 }, { "ce_loss": 2.275793, "epoch": 0.00174, "grad_norm": 0.005109087098389864, "key_mse_loss_layer_000": 0.002563, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07373, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.059082, "kv_mse_loss": 0.061481, "kv_vq_loss": 0.00084, "learning_rate": 0.0008101373120706499, "loss": 0.062317, "step": 1740, "value_mse_loss_layer_000": 0.000923, "value_mse_loss_layer_001": 0.002579, "value_mse_loss_layer_002": 0.010742, "value_mse_loss_layer_003": 0.016602, "value_mse_loss_layer_004": 0.015869, "value_mse_loss_layer_005": 0.015869, "value_mse_loss_layer_006": 0.019287, "value_mse_loss_layer_007": 0.021118, "value_mse_loss_layer_008": 0.024292, "value_mse_loss_layer_009": 0.03418, "value_mse_loss_layer_010": 0.030029, "value_mse_loss_layer_011": 0.030396, "value_mse_loss_layer_012": 0.030396, "value_mse_loss_layer_013": 0.031982, "value_mse_loss_layer_014": 0.032959, "value_mse_loss_layer_015": 0.033691, "value_mse_loss_layer_016": 0.030396, "value_mse_loss_layer_017": 0.034912, "value_mse_loss_layer_018": 0.029541, "value_mse_loss_layer_019": 0.039795, "value_mse_loss_layer_020": 0.039307, "value_mse_loss_layer_021": 0.043701, "value_mse_loss_layer_022": 0.040283, "value_mse_loss_layer_023": 0.045654, "value_mse_loss_layer_024": 0.055176, "value_mse_loss_layer_025": 0.069824, "value_mse_loss_layer_026": 0.054199, "value_mse_loss_layer_027": 0.072754, "value_mse_loss_layer_028": 0.067383, "value_mse_loss_layer_029": 0.099609, "value_mse_loss_layer_030": 0.091309, "value_mse_loss_layer_031": 0.130859, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 2.2e-05, "vq_loss_layer_002": 3.2e-05, "vq_loss_layer_003": 5.2e-05, "vq_loss_layer_004": 0.00012, "vq_loss_layer_005": 0.000148, "vq_loss_layer_006": 0.000273, "vq_loss_layer_007": 0.000311, "vq_loss_layer_008": 0.000364, "vq_loss_layer_009": 0.000538, "vq_loss_layer_010": 0.000463, "vq_loss_layer_011": 0.000465, "vq_loss_layer_012": 0.000668, "vq_loss_layer_013": 0.000534, "vq_loss_layer_014": 0.000736, "vq_loss_layer_015": 0.000664, "vq_loss_layer_016": 0.000721, "vq_loss_layer_017": 0.000668, "vq_loss_layer_018": 0.000294, "vq_loss_layer_019": 0.000423, "vq_loss_layer_020": 0.000404, "vq_loss_layer_021": 0.00074, "vq_loss_layer_022": 0.000488, "vq_loss_layer_023": 0.000546, "vq_loss_layer_024": 0.000656, "vq_loss_layer_025": 0.000908, "vq_loss_layer_026": 0.001122, "vq_loss_layer_027": 0.001305, "vq_loss_layer_028": 0.001541, "vq_loss_layer_029": 0.00209, "vq_loss_layer_030": 0.003403, "vq_loss_layer_031": 0.010498 }, { "ce_loss": 2.273308, "epoch": 0.00175, "grad_norm": 0.005847657099366188, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.129883, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.113281, "key_mse_loss_layer_016": 0.105957, "key_mse_loss_layer_017": 0.106445, "key_mse_loss_layer_018": 0.111816, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.10498, "key_mse_loss_layer_021": 0.100098, "key_mse_loss_layer_022": 0.103027, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.060797, "kv_vq_loss": 0.00085, "learning_rate": 0.0008107595121715735, "loss": 0.061624, "step": 1750, "value_mse_loss_layer_000": 0.000954, "value_mse_loss_layer_001": 0.002655, "value_mse_loss_layer_002": 0.010742, "value_mse_loss_layer_003": 0.018555, "value_mse_loss_layer_004": 0.016479, "value_mse_loss_layer_005": 0.016113, "value_mse_loss_layer_006": 0.018555, "value_mse_loss_layer_007": 0.020874, "value_mse_loss_layer_008": 0.023438, "value_mse_loss_layer_009": 0.031738, "value_mse_loss_layer_010": 0.027344, "value_mse_loss_layer_011": 0.028564, "value_mse_loss_layer_012": 0.032227, "value_mse_loss_layer_013": 0.031494, "value_mse_loss_layer_014": 0.036133, "value_mse_loss_layer_015": 0.032715, "value_mse_loss_layer_016": 0.028809, "value_mse_loss_layer_017": 0.032959, "value_mse_loss_layer_018": 0.030396, "value_mse_loss_layer_019": 0.037842, "value_mse_loss_layer_020": 0.038574, "value_mse_loss_layer_021": 0.045166, "value_mse_loss_layer_022": 0.049561, "value_mse_loss_layer_023": 0.047852, "value_mse_loss_layer_024": 0.05127, "value_mse_loss_layer_025": 0.07373, "value_mse_loss_layer_026": 0.052734, "value_mse_loss_layer_027": 0.07373, "value_mse_loss_layer_028": 0.07666, "value_mse_loss_layer_029": 0.102539, "value_mse_loss_layer_030": 0.102539, "value_mse_loss_layer_031": 0.138672, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.3e-05, "vq_loss_layer_003": 6.1e-05, "vq_loss_layer_004": 0.000109, "vq_loss_layer_005": 0.000141, "vq_loss_layer_006": 0.000229, "vq_loss_layer_007": 0.000305, "vq_loss_layer_008": 0.00034, "vq_loss_layer_009": 0.000483, "vq_loss_layer_010": 0.000368, "vq_loss_layer_011": 0.000431, "vq_loss_layer_012": 0.000908, "vq_loss_layer_013": 0.000576, "vq_loss_layer_014": 0.000801, "vq_loss_layer_015": 0.000591, "vq_loss_layer_016": 0.000652, "vq_loss_layer_017": 0.000542, "vq_loss_layer_018": 0.000313, "vq_loss_layer_019": 0.00032, "vq_loss_layer_020": 0.000338, "vq_loss_layer_021": 0.000763, "vq_loss_layer_022": 0.000679, "vq_loss_layer_023": 0.000484, "vq_loss_layer_024": 0.00042, "vq_loss_layer_025": 0.000767, "vq_loss_layer_026": 0.00079, "vq_loss_layer_027": 0.000938, "vq_loss_layer_028": 0.001869, "vq_loss_layer_029": 0.002014, "vq_loss_layer_030": 0.003235, "vq_loss_layer_031": 0.010315 }, { "ce_loss": 2.264417, "epoch": 0.00176, "grad_norm": 0.006859702058136463, "key_mse_loss_layer_000": 0.003571, "key_mse_loss_layer_001": 0.01123, "key_mse_loss_layer_002": 0.060547, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.102539, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.110352, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.102051, "key_mse_loss_layer_024": 0.083984, "key_mse_loss_layer_025": 0.07959, "key_mse_loss_layer_026": 0.09375, "key_mse_loss_layer_027": 0.097656, "key_mse_loss_layer_028": 0.100586, "key_mse_loss_layer_029": 0.094727, "key_mse_loss_layer_030": 0.100098, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.061456, "kv_vq_loss": 0.000854, "learning_rate": 0.0008113781669535372, "loss": 0.062299, "step": 1760, "value_mse_loss_layer_000": 0.000938, "value_mse_loss_layer_001": 0.002655, "value_mse_loss_layer_002": 0.012451, "value_mse_loss_layer_003": 0.017578, "value_mse_loss_layer_004": 0.016113, "value_mse_loss_layer_005": 0.015869, "value_mse_loss_layer_006": 0.017578, "value_mse_loss_layer_007": 0.020264, "value_mse_loss_layer_008": 0.02356, "value_mse_loss_layer_009": 0.028809, "value_mse_loss_layer_010": 0.025146, "value_mse_loss_layer_011": 0.0271, "value_mse_loss_layer_012": 0.027832, "value_mse_loss_layer_013": 0.027344, "value_mse_loss_layer_014": 0.030884, "value_mse_loss_layer_015": 0.029297, "value_mse_loss_layer_016": 0.026733, "value_mse_loss_layer_017": 0.03064, "value_mse_loss_layer_018": 0.033203, "value_mse_loss_layer_019": 0.036377, "value_mse_loss_layer_020": 0.040283, "value_mse_loss_layer_021": 0.04541, "value_mse_loss_layer_022": 0.043945, "value_mse_loss_layer_023": 0.077637, "value_mse_loss_layer_024": 0.055908, "value_mse_loss_layer_025": 0.078125, "value_mse_loss_layer_026": 0.070801, "value_mse_loss_layer_027": 0.083496, "value_mse_loss_layer_028": 0.078613, "value_mse_loss_layer_029": 0.125977, "value_mse_loss_layer_030": 0.110352, "value_mse_loss_layer_031": 0.145508, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 2.4e-05, "vq_loss_layer_002": 2.9e-05, "vq_loss_layer_003": 5e-05, "vq_loss_layer_004": 9e-05, "vq_loss_layer_005": 0.000119, "vq_loss_layer_006": 0.000176, "vq_loss_layer_007": 0.000277, "vq_loss_layer_008": 0.000338, "vq_loss_layer_009": 0.000387, "vq_loss_layer_010": 0.000338, "vq_loss_layer_011": 0.000391, "vq_loss_layer_012": 0.000599, "vq_loss_layer_013": 0.000393, "vq_loss_layer_014": 0.000656, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.000549, "vq_loss_layer_017": 0.000538, "vq_loss_layer_018": 0.000406, "vq_loss_layer_019": 0.000286, "vq_loss_layer_020": 0.000303, "vq_loss_layer_021": 0.000542, "vq_loss_layer_022": 0.000374, "vq_loss_layer_023": 0.00079, "vq_loss_layer_024": 0.000414, "vq_loss_layer_025": 0.000732, "vq_loss_layer_026": 0.001549, "vq_loss_layer_027": 0.001015, "vq_loss_layer_028": 0.001549, "vq_loss_layer_029": 0.003494, "vq_loss_layer_030": 0.004089, "vq_loss_layer_031": 0.011536 }, { "ce_loss": 2.308928, "epoch": 0.00177, "grad_norm": 0.005844578612595797, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.060815, "kv_vq_loss": 0.000842, "learning_rate": 0.0008119933165904514, "loss": 0.061636, "step": 1770, "value_mse_loss_layer_000": 0.000999, "value_mse_loss_layer_001": 0.002625, "value_mse_loss_layer_002": 0.011047, "value_mse_loss_layer_003": 0.017822, "value_mse_loss_layer_004": 0.0177, "value_mse_loss_layer_005": 0.015625, "value_mse_loss_layer_006": 0.018677, "value_mse_loss_layer_007": 0.020508, "value_mse_loss_layer_008": 0.024902, "value_mse_loss_layer_009": 0.031494, "value_mse_loss_layer_010": 0.02832, "value_mse_loss_layer_011": 0.030762, "value_mse_loss_layer_012": 0.029785, "value_mse_loss_layer_013": 0.03125, "value_mse_loss_layer_014": 0.033203, "value_mse_loss_layer_015": 0.036133, "value_mse_loss_layer_016": 0.032227, "value_mse_loss_layer_017": 0.034424, "value_mse_loss_layer_018": 0.031982, "value_mse_loss_layer_019": 0.039062, "value_mse_loss_layer_020": 0.039307, "value_mse_loss_layer_021": 0.04834, "value_mse_loss_layer_022": 0.044434, "value_mse_loss_layer_023": 0.053467, "value_mse_loss_layer_024": 0.05542, "value_mse_loss_layer_025": 0.072266, "value_mse_loss_layer_026": 0.05957, "value_mse_loss_layer_027": 0.089355, "value_mse_loss_layer_028": 0.078613, "value_mse_loss_layer_029": 0.115723, "value_mse_loss_layer_030": 0.097168, "value_mse_loss_layer_031": 0.129883, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 4.3e-05, "vq_loss_layer_004": 0.000175, "vq_loss_layer_005": 0.000117, "vq_loss_layer_006": 0.000207, "vq_loss_layer_007": 0.000284, "vq_loss_layer_008": 0.000341, "vq_loss_layer_009": 0.000385, "vq_loss_layer_010": 0.000364, "vq_loss_layer_011": 0.0005, "vq_loss_layer_012": 0.000591, "vq_loss_layer_013": 0.0005, "vq_loss_layer_014": 0.000633, "vq_loss_layer_015": 0.000698, "vq_loss_layer_016": 0.000683, "vq_loss_layer_017": 0.000572, "vq_loss_layer_018": 0.000319, "vq_loss_layer_019": 0.000267, "vq_loss_layer_020": 0.000311, "vq_loss_layer_021": 0.000675, "vq_loss_layer_022": 0.00036, "vq_loss_layer_023": 0.0005, "vq_loss_layer_024": 0.000422, "vq_loss_layer_025": 0.000553, "vq_loss_layer_026": 0.000828, "vq_loss_layer_027": 0.001144, "vq_loss_layer_028": 0.001259, "vq_loss_layer_029": 0.001839, "vq_loss_layer_030": 0.003616, "vq_loss_layer_031": 0.007996 }, { "ce_loss": 2.269924, "epoch": 0.00178, "grad_norm": 0.006869825068861246, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.061557, "kv_vq_loss": 0.000881, "learning_rate": 0.0008126050005772234, "loss": 0.062436, "step": 1780, "value_mse_loss_layer_000": 0.000961, "value_mse_loss_layer_001": 0.002701, "value_mse_loss_layer_002": 0.010864, "value_mse_loss_layer_003": 0.016968, "value_mse_loss_layer_004": 0.015198, "value_mse_loss_layer_005": 0.016113, "value_mse_loss_layer_006": 0.018433, "value_mse_loss_layer_007": 0.020386, "value_mse_loss_layer_008": 0.023438, "value_mse_loss_layer_009": 0.031738, "value_mse_loss_layer_010": 0.026611, "value_mse_loss_layer_011": 0.028076, "value_mse_loss_layer_012": 0.033447, "value_mse_loss_layer_013": 0.030151, "value_mse_loss_layer_014": 0.033936, "value_mse_loss_layer_015": 0.033447, "value_mse_loss_layer_016": 0.029297, "value_mse_loss_layer_017": 0.034424, "value_mse_loss_layer_018": 0.032715, "value_mse_loss_layer_019": 0.038818, "value_mse_loss_layer_020": 0.046143, "value_mse_loss_layer_021": 0.050537, "value_mse_loss_layer_022": 0.043213, "value_mse_loss_layer_023": 0.05127, "value_mse_loss_layer_024": 0.070312, "value_mse_loss_layer_025": 0.078125, "value_mse_loss_layer_026": 0.05542, "value_mse_loss_layer_027": 0.080078, "value_mse_loss_layer_028": 0.074707, "value_mse_loss_layer_029": 0.106445, "value_mse_loss_layer_030": 0.099609, "value_mse_loss_layer_031": 0.140625, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 0.000103, "vq_loss_layer_005": 0.000138, "vq_loss_layer_006": 0.000206, "vq_loss_layer_007": 0.000296, "vq_loss_layer_008": 0.000263, "vq_loss_layer_009": 0.000425, "vq_loss_layer_010": 0.000294, "vq_loss_layer_011": 0.000345, "vq_loss_layer_012": 0.000858, "vq_loss_layer_013": 0.000483, "vq_loss_layer_014": 0.000721, "vq_loss_layer_015": 0.000603, "vq_loss_layer_016": 0.000557, "vq_loss_layer_017": 0.000549, "vq_loss_layer_018": 0.000319, "vq_loss_layer_019": 0.00029, "vq_loss_layer_020": 0.00037, "vq_loss_layer_021": 0.000656, "vq_loss_layer_022": 0.000311, "vq_loss_layer_023": 0.000406, "vq_loss_layer_024": 0.000538, "vq_loss_layer_025": 0.000523, "vq_loss_layer_026": 0.000675, "vq_loss_layer_027": 0.000874, "vq_loss_layer_028": 0.00116, "vq_loss_layer_029": 0.001747, "vq_loss_layer_030": 0.003021, "vq_loss_layer_031": 0.009216 }, { "ce_loss": 2.27603, "epoch": 0.00179, "grad_norm": 0.0035792673006653786, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.060791, "kv_vq_loss": 0.00085, "learning_rate": 0.0008132132577449733, "loss": 0.06163, "step": 1790, "value_mse_loss_layer_000": 0.000965, "value_mse_loss_layer_001": 0.00264, "value_mse_loss_layer_002": 0.010559, "value_mse_loss_layer_003": 0.016724, "value_mse_loss_layer_004": 0.015259, "value_mse_loss_layer_005": 0.015625, "value_mse_loss_layer_006": 0.018066, "value_mse_loss_layer_007": 0.02063, "value_mse_loss_layer_008": 0.023804, "value_mse_loss_layer_009": 0.030518, "value_mse_loss_layer_010": 0.026367, "value_mse_loss_layer_011": 0.028687, "value_mse_loss_layer_012": 0.031494, "value_mse_loss_layer_013": 0.030762, "value_mse_loss_layer_014": 0.032471, "value_mse_loss_layer_015": 0.033936, "value_mse_loss_layer_016": 0.030151, "value_mse_loss_layer_017": 0.033691, "value_mse_loss_layer_018": 0.032471, "value_mse_loss_layer_019": 0.037354, "value_mse_loss_layer_020": 0.040527, "value_mse_loss_layer_021": 0.049316, "value_mse_loss_layer_022": 0.044434, "value_mse_loss_layer_023": 0.051025, "value_mse_loss_layer_024": 0.053467, "value_mse_loss_layer_025": 0.067383, "value_mse_loss_layer_026": 0.055664, "value_mse_loss_layer_027": 0.074219, "value_mse_loss_layer_028": 0.075195, "value_mse_loss_layer_029": 0.109375, "value_mse_loss_layer_030": 0.098145, "value_mse_loss_layer_031": 0.131836, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 9.3e-05, "vq_loss_layer_005": 0.00013, "vq_loss_layer_006": 0.000191, "vq_loss_layer_007": 0.000286, "vq_loss_layer_008": 0.000298, "vq_loss_layer_009": 0.000368, "vq_loss_layer_010": 0.000319, "vq_loss_layer_011": 0.000391, "vq_loss_layer_012": 0.000668, "vq_loss_layer_013": 0.0005, "vq_loss_layer_014": 0.000584, "vq_loss_layer_015": 0.000599, "vq_loss_layer_016": 0.000618, "vq_loss_layer_017": 0.000515, "vq_loss_layer_018": 0.000334, "vq_loss_layer_019": 0.00032, "vq_loss_layer_020": 0.000372, "vq_loss_layer_021": 0.000721, "vq_loss_layer_022": 0.000368, "vq_loss_layer_023": 0.000454, "vq_loss_layer_024": 0.000378, "vq_loss_layer_025": 0.000526, "vq_loss_layer_026": 0.000759, "vq_loss_layer_027": 0.000816, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.002121, "vq_loss_layer_030": 0.002945, "vq_loss_layer_031": 0.008728 }, { "ce_loss": 2.310793, "epoch": 0.0018, "grad_norm": 0.005204883404076099, "key_mse_loss_layer_000": 0.002808, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.060687, "kv_vq_loss": 0.000826, "learning_rate": 0.0008138181262758264, "loss": 0.061499, "step": 1800, "value_mse_loss_layer_000": 0.000961, "value_mse_loss_layer_001": 0.002594, "value_mse_loss_layer_002": 0.010376, "value_mse_loss_layer_003": 0.018311, "value_mse_loss_layer_004": 0.014526, "value_mse_loss_layer_005": 0.014893, "value_mse_loss_layer_006": 0.018311, "value_mse_loss_layer_007": 0.020752, "value_mse_loss_layer_008": 0.023926, "value_mse_loss_layer_009": 0.03418, "value_mse_loss_layer_010": 0.02771, "value_mse_loss_layer_011": 0.030396, "value_mse_loss_layer_012": 0.031006, "value_mse_loss_layer_013": 0.031006, "value_mse_loss_layer_014": 0.032715, "value_mse_loss_layer_015": 0.0354, "value_mse_loss_layer_016": 0.030151, "value_mse_loss_layer_017": 0.034668, "value_mse_loss_layer_018": 0.033691, "value_mse_loss_layer_019": 0.037598, "value_mse_loss_layer_020": 0.039062, "value_mse_loss_layer_021": 0.048828, "value_mse_loss_layer_022": 0.043945, "value_mse_loss_layer_023": 0.063477, "value_mse_loss_layer_024": 0.052734, "value_mse_loss_layer_025": 0.068848, "value_mse_loss_layer_026": 0.056641, "value_mse_loss_layer_027": 0.071777, "value_mse_loss_layer_028": 0.07373, "value_mse_loss_layer_029": 0.100586, "value_mse_loss_layer_030": 0.093262, "value_mse_loss_layer_031": 0.122559, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 0.000104, "vq_loss_layer_005": 0.000135, "vq_loss_layer_006": 0.00021, "vq_loss_layer_007": 0.000311, "vq_loss_layer_008": 0.000305, "vq_loss_layer_009": 0.000603, "vq_loss_layer_010": 0.000336, "vq_loss_layer_011": 0.000488, "vq_loss_layer_012": 0.000675, "vq_loss_layer_013": 0.000519, "vq_loss_layer_014": 0.000614, "vq_loss_layer_015": 0.00071, "vq_loss_layer_016": 0.00058, "vq_loss_layer_017": 0.000813, "vq_loss_layer_018": 0.000385, "vq_loss_layer_019": 0.000267, "vq_loss_layer_020": 0.000364, "vq_loss_layer_021": 0.00071, "vq_loss_layer_022": 0.00045, "vq_loss_layer_023": 0.000751, "vq_loss_layer_024": 0.00038, "vq_loss_layer_025": 0.000542, "vq_loss_layer_026": 0.000801, "vq_loss_layer_027": 0.000771, "vq_loss_layer_028": 0.001114, "vq_loss_layer_029": 0.001602, "vq_loss_layer_030": 0.003281, "vq_loss_layer_031": 0.006989 }, { "ce_loss": 2.291148, "epoch": 0.00181, "grad_norm": 0.005432142876088619, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.061267, "kv_vq_loss": 0.000872, "learning_rate": 0.000814419643717296, "loss": 0.062125, "step": 1810, "value_mse_loss_layer_000": 0.000954, "value_mse_loss_layer_001": 0.00264, "value_mse_loss_layer_002": 0.011597, "value_mse_loss_layer_003": 0.016968, "value_mse_loss_layer_004": 0.014832, "value_mse_loss_layer_005": 0.018433, "value_mse_loss_layer_006": 0.018311, "value_mse_loss_layer_007": 0.02063, "value_mse_loss_layer_008": 0.02417, "value_mse_loss_layer_009": 0.031494, "value_mse_loss_layer_010": 0.027466, "value_mse_loss_layer_011": 0.028931, "value_mse_loss_layer_012": 0.029907, "value_mse_loss_layer_013": 0.03125, "value_mse_loss_layer_014": 0.031982, "value_mse_loss_layer_015": 0.035156, "value_mse_loss_layer_016": 0.03125, "value_mse_loss_layer_017": 0.034424, "value_mse_loss_layer_018": 0.032471, "value_mse_loss_layer_019": 0.037598, "value_mse_loss_layer_020": 0.039551, "value_mse_loss_layer_021": 0.049072, "value_mse_loss_layer_022": 0.044678, "value_mse_loss_layer_023": 0.061768, "value_mse_loss_layer_024": 0.057861, "value_mse_loss_layer_025": 0.072754, "value_mse_loss_layer_026": 0.055908, "value_mse_loss_layer_027": 0.077148, "value_mse_loss_layer_028": 0.07373, "value_mse_loss_layer_029": 0.108887, "value_mse_loss_layer_030": 0.106445, "value_mse_loss_layer_031": 0.129883, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 0.000107, "vq_loss_layer_005": 0.000198, "vq_loss_layer_006": 0.000203, "vq_loss_layer_007": 0.000301, "vq_loss_layer_008": 0.000284, "vq_loss_layer_009": 0.000416, "vq_loss_layer_010": 0.000364, "vq_loss_layer_011": 0.000374, "vq_loss_layer_012": 0.000591, "vq_loss_layer_013": 0.000504, "vq_loss_layer_014": 0.000549, "vq_loss_layer_015": 0.000626, "vq_loss_layer_016": 0.000607, "vq_loss_layer_017": 0.000523, "vq_loss_layer_018": 0.000347, "vq_loss_layer_019": 0.000275, "vq_loss_layer_020": 0.000307, "vq_loss_layer_021": 0.000607, "vq_loss_layer_022": 0.000334, "vq_loss_layer_023": 0.000599, "vq_loss_layer_024": 0.000425, "vq_loss_layer_025": 0.000587, "vq_loss_layer_026": 0.000629, "vq_loss_layer_027": 0.000835, "vq_loss_layer_028": 0.001022, "vq_loss_layer_029": 0.00206, "vq_loss_layer_030": 0.002823, "vq_loss_layer_031": 0.008118 }, { "ce_loss": 2.241967, "epoch": 0.00182, "grad_norm": 0.00497777434065938, "key_mse_loss_layer_000": 0.003601, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.061063, "kv_vq_loss": 0.000867, "learning_rate": 0.0008150178469962686, "loss": 0.061932, "step": 1820, "value_mse_loss_layer_000": 0.000977, "value_mse_loss_layer_001": 0.002701, "value_mse_loss_layer_002": 0.01062, "value_mse_loss_layer_003": 0.017578, "value_mse_loss_layer_004": 0.015076, "value_mse_loss_layer_005": 0.01532, "value_mse_loss_layer_006": 0.019043, "value_mse_loss_layer_007": 0.020142, "value_mse_loss_layer_008": 0.023926, "value_mse_loss_layer_009": 0.030273, "value_mse_loss_layer_010": 0.026611, "value_mse_loss_layer_011": 0.029175, "value_mse_loss_layer_012": 0.034424, "value_mse_loss_layer_013": 0.030151, "value_mse_loss_layer_014": 0.033203, "value_mse_loss_layer_015": 0.034424, "value_mse_loss_layer_016": 0.029663, "value_mse_loss_layer_017": 0.032715, "value_mse_loss_layer_018": 0.032471, "value_mse_loss_layer_019": 0.037842, "value_mse_loss_layer_020": 0.039795, "value_mse_loss_layer_021": 0.053955, "value_mse_loss_layer_022": 0.052734, "value_mse_loss_layer_023": 0.054443, "value_mse_loss_layer_024": 0.062988, "value_mse_loss_layer_025": 0.07373, "value_mse_loss_layer_026": 0.057861, "value_mse_loss_layer_027": 0.07959, "value_mse_loss_layer_028": 0.079102, "value_mse_loss_layer_029": 0.121094, "value_mse_loss_layer_030": 0.10791, "value_mse_loss_layer_031": 0.132812, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 8.7e-05, "vq_loss_layer_005": 0.000112, "vq_loss_layer_006": 0.000229, "vq_loss_layer_007": 0.00029, "vq_loss_layer_008": 0.000292, "vq_loss_layer_009": 0.000362, "vq_loss_layer_010": 0.00033, "vq_loss_layer_011": 0.000422, "vq_loss_layer_012": 0.001007, "vq_loss_layer_013": 0.000488, "vq_loss_layer_014": 0.000645, "vq_loss_layer_015": 0.000641, "vq_loss_layer_016": 0.000648, "vq_loss_layer_017": 0.000534, "vq_loss_layer_018": 0.000307, "vq_loss_layer_019": 0.000269, "vq_loss_layer_020": 0.000317, "vq_loss_layer_021": 0.00074, "vq_loss_layer_022": 0.000484, "vq_loss_layer_023": 0.000462, "vq_loss_layer_024": 0.000467, "vq_loss_layer_025": 0.000515, "vq_loss_layer_026": 0.000698, "vq_loss_layer_027": 0.000771, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.002045, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.007874 }, { "ce_loss": 2.315694, "epoch": 0.00183, "grad_norm": 0.0050158193334937096, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.060605, "kv_vq_loss": 0.000844, "learning_rate": 0.0008156127724326072, "loss": 0.061423, "step": 1830, "value_mse_loss_layer_000": 0.000938, "value_mse_loss_layer_001": 0.002625, "value_mse_loss_layer_002": 0.010498, "value_mse_loss_layer_003": 0.016357, "value_mse_loss_layer_004": 0.014587, "value_mse_loss_layer_005": 0.014709, "value_mse_loss_layer_006": 0.017944, "value_mse_loss_layer_007": 0.020508, "value_mse_loss_layer_008": 0.023804, "value_mse_loss_layer_009": 0.032471, "value_mse_loss_layer_010": 0.026001, "value_mse_loss_layer_011": 0.028198, "value_mse_loss_layer_012": 0.030273, "value_mse_loss_layer_013": 0.029297, "value_mse_loss_layer_014": 0.031982, "value_mse_loss_layer_015": 0.033447, "value_mse_loss_layer_016": 0.029541, "value_mse_loss_layer_017": 0.033447, "value_mse_loss_layer_018": 0.03125, "value_mse_loss_layer_019": 0.035645, "value_mse_loss_layer_020": 0.03833, "value_mse_loss_layer_021": 0.047119, "value_mse_loss_layer_022": 0.045898, "value_mse_loss_layer_023": 0.049316, "value_mse_loss_layer_024": 0.05249, "value_mse_loss_layer_025": 0.073242, "value_mse_loss_layer_026": 0.052002, "value_mse_loss_layer_027": 0.070801, "value_mse_loss_layer_028": 0.070312, "value_mse_loss_layer_029": 0.10791, "value_mse_loss_layer_030": 0.094238, "value_mse_loss_layer_031": 0.134766, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 0.000108, "vq_loss_layer_005": 0.000108, "vq_loss_layer_006": 0.0002, "vq_loss_layer_007": 0.000309, "vq_loss_layer_008": 0.000315, "vq_loss_layer_009": 0.000484, "vq_loss_layer_010": 0.000298, "vq_loss_layer_011": 0.000359, "vq_loss_layer_012": 0.000759, "vq_loss_layer_013": 0.000437, "vq_loss_layer_014": 0.00061, "vq_loss_layer_015": 0.000648, "vq_loss_layer_016": 0.000591, "vq_loss_layer_017": 0.000683, "vq_loss_layer_018": 0.000324, "vq_loss_layer_019": 0.000237, "vq_loss_layer_020": 0.000292, "vq_loss_layer_021": 0.000706, "vq_loss_layer_022": 0.000471, "vq_loss_layer_023": 0.000504, "vq_loss_layer_024": 0.000397, "vq_loss_layer_025": 0.000599, "vq_loss_layer_026": 0.000717, "vq_loss_layer_027": 0.000717, "vq_loss_layer_028": 0.001083, "vq_loss_layer_029": 0.001732, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.008484 }, { "ce_loss": 2.306116, "epoch": 0.00184, "grad_norm": 0.0036607717629522085, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.105957, "key_mse_loss_layer_014": 0.103027, "key_mse_loss_layer_015": 0.093262, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.089355, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.060501, "kv_vq_loss": 0.000833, "learning_rate": 0.000816204455752384, "loss": 0.061319, "step": 1840, "value_mse_loss_layer_000": 0.000938, "value_mse_loss_layer_001": 0.00267, "value_mse_loss_layer_002": 0.010376, "value_mse_loss_layer_003": 0.016602, "value_mse_loss_layer_004": 0.015625, "value_mse_loss_layer_005": 0.014832, "value_mse_loss_layer_006": 0.017334, "value_mse_loss_layer_007": 0.019897, "value_mse_loss_layer_008": 0.02356, "value_mse_loss_layer_009": 0.029541, "value_mse_loss_layer_010": 0.02771, "value_mse_loss_layer_011": 0.027222, "value_mse_loss_layer_012": 0.029541, "value_mse_loss_layer_013": 0.030029, "value_mse_loss_layer_014": 0.030762, "value_mse_loss_layer_015": 0.032715, "value_mse_loss_layer_016": 0.029785, "value_mse_loss_layer_017": 0.031982, "value_mse_loss_layer_018": 0.034424, "value_mse_loss_layer_019": 0.038574, "value_mse_loss_layer_020": 0.039062, "value_mse_loss_layer_021": 0.046143, "value_mse_loss_layer_022": 0.043945, "value_mse_loss_layer_023": 0.05249, "value_mse_loss_layer_024": 0.05957, "value_mse_loss_layer_025": 0.071289, "value_mse_loss_layer_026": 0.058594, "value_mse_loss_layer_027": 0.07959, "value_mse_loss_layer_028": 0.077637, "value_mse_loss_layer_029": 0.114746, "value_mse_loss_layer_030": 0.103516, "value_mse_loss_layer_031": 0.129883, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 0.000115, "vq_loss_layer_005": 9.8e-05, "vq_loss_layer_006": 0.000168, "vq_loss_layer_007": 0.000273, "vq_loss_layer_008": 0.000271, "vq_loss_layer_009": 0.000345, "vq_loss_layer_010": 0.00033, "vq_loss_layer_011": 0.000328, "vq_loss_layer_012": 0.000626, "vq_loss_layer_013": 0.000534, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.00061, "vq_loss_layer_016": 0.00058, "vq_loss_layer_017": 0.000446, "vq_loss_layer_018": 0.00038, "vq_loss_layer_019": 0.000267, "vq_loss_layer_020": 0.00029, "vq_loss_layer_021": 0.000496, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.00036, "vq_loss_layer_024": 0.000364, "vq_loss_layer_025": 0.000408, "vq_loss_layer_026": 0.000641, "vq_loss_layer_027": 0.000774, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.002075, "vq_loss_layer_030": 0.002823, "vq_loss_layer_031": 0.007874 }, { "ce_loss": 2.25544, "epoch": 0.00185, "grad_norm": 0.004709223285317421, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.067871, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.090332, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.09082, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.060712, "kv_vq_loss": 0.000835, "learning_rate": 0.0008167929321007533, "loss": 0.061526, "step": 1850, "value_mse_loss_layer_000": 0.000885, "value_mse_loss_layer_001": 0.002548, "value_mse_loss_layer_002": 0.010437, "value_mse_loss_layer_003": 0.01709, "value_mse_loss_layer_004": 0.015259, "value_mse_loss_layer_005": 0.015503, "value_mse_loss_layer_006": 0.0177, "value_mse_loss_layer_007": 0.019531, "value_mse_loss_layer_008": 0.024048, "value_mse_loss_layer_009": 0.028198, "value_mse_loss_layer_010": 0.025391, "value_mse_loss_layer_011": 0.025757, "value_mse_loss_layer_012": 0.026855, "value_mse_loss_layer_013": 0.026855, "value_mse_loss_layer_014": 0.030762, "value_mse_loss_layer_015": 0.029541, "value_mse_loss_layer_016": 0.026733, "value_mse_loss_layer_017": 0.030762, "value_mse_loss_layer_018": 0.034912, "value_mse_loss_layer_019": 0.039307, "value_mse_loss_layer_020": 0.040527, "value_mse_loss_layer_021": 0.047852, "value_mse_loss_layer_022": 0.046143, "value_mse_loss_layer_023": 0.056641, "value_mse_loss_layer_024": 0.061035, "value_mse_loss_layer_025": 0.074219, "value_mse_loss_layer_026": 0.063965, "value_mse_loss_layer_027": 0.087402, "value_mse_loss_layer_028": 0.083008, "value_mse_loss_layer_029": 0.126953, "value_mse_loss_layer_030": 0.11377, "value_mse_loss_layer_031": 0.144531, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 8.8e-05, "vq_loss_layer_005": 0.000113, "vq_loss_layer_006": 0.000198, "vq_loss_layer_007": 0.000267, "vq_loss_layer_008": 0.000351, "vq_loss_layer_009": 0.00034, "vq_loss_layer_010": 0.000355, "vq_loss_layer_011": 0.000313, "vq_loss_layer_012": 0.000538, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000584, "vq_loss_layer_015": 0.000504, "vq_loss_layer_016": 0.000519, "vq_loss_layer_017": 0.000488, "vq_loss_layer_018": 0.000576, "vq_loss_layer_019": 0.000328, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.0005, "vq_loss_layer_022": 0.000355, "vq_loss_layer_023": 0.000443, "vq_loss_layer_024": 0.000334, "vq_loss_layer_025": 0.000395, "vq_loss_layer_026": 0.000713, "vq_loss_layer_027": 0.000813, "vq_loss_layer_028": 0.001152, "vq_loss_layer_029": 0.00206, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.009827 }, { "ce_loss": 2.261442, "epoch": 0.00186, "grad_norm": 0.004774907603859901, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.061368, "kv_vq_loss": 0.000861, "learning_rate": 0.0008173782360544789, "loss": 0.062225, "step": 1860, "value_mse_loss_layer_000": 0.000965, "value_mse_loss_layer_001": 0.002655, "value_mse_loss_layer_002": 0.010681, "value_mse_loss_layer_003": 0.016724, "value_mse_loss_layer_004": 0.015381, "value_mse_loss_layer_005": 0.016113, "value_mse_loss_layer_006": 0.018188, "value_mse_loss_layer_007": 0.021118, "value_mse_loss_layer_008": 0.02417, "value_mse_loss_layer_009": 0.031006, "value_mse_loss_layer_010": 0.026733, "value_mse_loss_layer_011": 0.029419, "value_mse_loss_layer_012": 0.029053, "value_mse_loss_layer_013": 0.030273, "value_mse_loss_layer_014": 0.032959, "value_mse_loss_layer_015": 0.034668, "value_mse_loss_layer_016": 0.030518, "value_mse_loss_layer_017": 0.033691, "value_mse_loss_layer_018": 0.032959, "value_mse_loss_layer_019": 0.037598, "value_mse_loss_layer_020": 0.039307, "value_mse_loss_layer_021": 0.048096, "value_mse_loss_layer_022": 0.04541, "value_mse_loss_layer_023": 0.056396, "value_mse_loss_layer_024": 0.057373, "value_mse_loss_layer_025": 0.068848, "value_mse_loss_layer_026": 0.06543, "value_mse_loss_layer_027": 0.075684, "value_mse_loss_layer_028": 0.074219, "value_mse_loss_layer_029": 0.11084, "value_mse_loss_layer_030": 0.096191, "value_mse_loss_layer_031": 0.131836, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 0.000111, "vq_loss_layer_005": 0.000137, "vq_loss_layer_006": 0.000198, "vq_loss_layer_007": 0.000313, "vq_loss_layer_008": 0.000301, "vq_loss_layer_009": 0.000376, "vq_loss_layer_010": 0.000307, "vq_loss_layer_011": 0.00042, "vq_loss_layer_012": 0.000576, "vq_loss_layer_013": 0.000467, "vq_loss_layer_014": 0.000584, "vq_loss_layer_015": 0.000637, "vq_loss_layer_016": 0.000607, "vq_loss_layer_017": 0.000526, "vq_loss_layer_018": 0.000336, "vq_loss_layer_019": 0.000263, "vq_loss_layer_020": 0.000301, "vq_loss_layer_021": 0.00061, "vq_loss_layer_022": 0.000357, "vq_loss_layer_023": 0.000542, "vq_loss_layer_024": 0.000404, "vq_loss_layer_025": 0.000456, "vq_loss_layer_026": 0.000942, "vq_loss_layer_027": 0.000854, "vq_loss_layer_028": 0.000973, "vq_loss_layer_029": 0.001732, "vq_loss_layer_030": 0.003433, "vq_loss_layer_031": 0.008057 }, { "ce_loss": 2.274118, "epoch": 0.00187, "grad_norm": 0.005285901483148336, "key_mse_loss_layer_000": 0.004272, "key_mse_loss_layer_001": 0.012451, "key_mse_loss_layer_002": 0.068848, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.066406, "key_mse_loss_layer_006": 0.075684, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.090332, "key_mse_loss_layer_009": 0.094727, "key_mse_loss_layer_010": 0.111328, "key_mse_loss_layer_011": 0.108398, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.129883, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.117188, "key_mse_loss_layer_016": 0.111328, "key_mse_loss_layer_017": 0.10791, "key_mse_loss_layer_018": 0.120117, "key_mse_loss_layer_019": 0.101074, "key_mse_loss_layer_020": 0.111328, "key_mse_loss_layer_021": 0.104492, "key_mse_loss_layer_022": 0.111816, "key_mse_loss_layer_023": 0.112305, "key_mse_loss_layer_024": 0.094727, "key_mse_loss_layer_025": 0.087402, "key_mse_loss_layer_026": 0.106445, "key_mse_loss_layer_027": 0.112793, "key_mse_loss_layer_028": 0.113281, "key_mse_loss_layer_029": 0.104492, "key_mse_loss_layer_030": 0.119629, "key_mse_loss_layer_031": 0.087402, "kv_mse_loss": 0.060883, "kv_vq_loss": 0.000849, "learning_rate": 0.0008179604016341247, "loss": 0.061722, "step": 1870, "value_mse_loss_layer_000": 0.000813, "value_mse_loss_layer_001": 0.002487, "value_mse_loss_layer_002": 0.010925, "value_mse_loss_layer_003": 0.01709, "value_mse_loss_layer_004": 0.016235, "value_mse_loss_layer_005": 0.015869, "value_mse_loss_layer_006": 0.017456, "value_mse_loss_layer_007": 0.020264, "value_mse_loss_layer_008": 0.022705, "value_mse_loss_layer_009": 0.029175, "value_mse_loss_layer_010": 0.025757, "value_mse_loss_layer_011": 0.026978, "value_mse_loss_layer_012": 0.030151, "value_mse_loss_layer_013": 0.028442, "value_mse_loss_layer_014": 0.031982, "value_mse_loss_layer_015": 0.028076, "value_mse_loss_layer_016": 0.025879, "value_mse_loss_layer_017": 0.029663, "value_mse_loss_layer_018": 0.0354, "value_mse_loss_layer_019": 0.039062, "value_mse_loss_layer_020": 0.038086, "value_mse_loss_layer_021": 0.046143, "value_mse_loss_layer_022": 0.043945, "value_mse_loss_layer_023": 0.050781, "value_mse_loss_layer_024": 0.069824, "value_mse_loss_layer_025": 0.074219, "value_mse_loss_layer_026": 0.064453, "value_mse_loss_layer_027": 0.094238, "value_mse_loss_layer_028": 0.078613, "value_mse_loss_layer_029": 0.135742, "value_mse_loss_layer_030": 0.125, "value_mse_loss_layer_031": 0.151367, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 2.8e-05, "vq_loss_layer_002": 5.6e-05, "vq_loss_layer_003": 5.5e-05, "vq_loss_layer_004": 0.000128, "vq_loss_layer_005": 0.000159, "vq_loss_layer_006": 0.000227, "vq_loss_layer_007": 0.000328, "vq_loss_layer_008": 0.000364, "vq_loss_layer_009": 0.000484, "vq_loss_layer_010": 0.000463, "vq_loss_layer_011": 0.000414, "vq_loss_layer_012": 0.000778, "vq_loss_layer_013": 0.000473, "vq_loss_layer_014": 0.000816, "vq_loss_layer_015": 0.000614, "vq_loss_layer_016": 0.000584, "vq_loss_layer_017": 0.000538, "vq_loss_layer_018": 0.000584, "vq_loss_layer_019": 0.000349, "vq_loss_layer_020": 0.000309, "vq_loss_layer_021": 0.0005, "vq_loss_layer_022": 0.000343, "vq_loss_layer_023": 0.000303, "vq_loss_layer_024": 0.000561, "vq_loss_layer_025": 0.000862, "vq_loss_layer_026": 0.001038, "vq_loss_layer_027": 0.001404, "vq_loss_layer_028": 0.001709, "vq_loss_layer_029": 0.006073, "vq_loss_layer_030": 0.005005, "vq_loss_layer_031": 0.014099 }, { "ce_loss": 2.300503, "epoch": 0.00188, "grad_norm": 0.004524495918303728, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.06088, "kv_vq_loss": 0.000837, "learning_rate": 0.0008185394623159199, "loss": 0.06171, "step": 1880, "value_mse_loss_layer_000": 0.000957, "value_mse_loss_layer_001": 0.002625, "value_mse_loss_layer_002": 0.010376, "value_mse_loss_layer_003": 0.015991, "value_mse_loss_layer_004": 0.014709, "value_mse_loss_layer_005": 0.015503, "value_mse_loss_layer_006": 0.020874, "value_mse_loss_layer_007": 0.019775, "value_mse_loss_layer_008": 0.02356, "value_mse_loss_layer_009": 0.03125, "value_mse_loss_layer_010": 0.027344, "value_mse_loss_layer_011": 0.030884, "value_mse_loss_layer_012": 0.030762, "value_mse_loss_layer_013": 0.03125, "value_mse_loss_layer_014": 0.032227, "value_mse_loss_layer_015": 0.035645, "value_mse_loss_layer_016": 0.031006, "value_mse_loss_layer_017": 0.033447, "value_mse_loss_layer_018": 0.033447, "value_mse_loss_layer_019": 0.037354, "value_mse_loss_layer_020": 0.037598, "value_mse_loss_layer_021": 0.051758, "value_mse_loss_layer_022": 0.044678, "value_mse_loss_layer_023": 0.056885, "value_mse_loss_layer_024": 0.057129, "value_mse_loss_layer_025": 0.066406, "value_mse_loss_layer_026": 0.055664, "value_mse_loss_layer_027": 0.07959, "value_mse_loss_layer_028": 0.07666, "value_mse_loss_layer_029": 0.109375, "value_mse_loss_layer_030": 0.099121, "value_mse_loss_layer_031": 0.129883, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 0.000105, "vq_loss_layer_005": 0.000134, "vq_loss_layer_006": 0.000324, "vq_loss_layer_007": 0.000284, "vq_loss_layer_008": 0.000286, "vq_loss_layer_009": 0.000404, "vq_loss_layer_010": 0.000338, "vq_loss_layer_011": 0.000523, "vq_loss_layer_012": 0.000671, "vq_loss_layer_013": 0.000546, "vq_loss_layer_014": 0.00061, "vq_loss_layer_015": 0.000835, "vq_loss_layer_016": 0.000633, "vq_loss_layer_017": 0.000565, "vq_loss_layer_018": 0.000326, "vq_loss_layer_019": 0.000319, "vq_loss_layer_020": 0.000299, "vq_loss_layer_021": 0.000755, "vq_loss_layer_022": 0.000359, "vq_loss_layer_023": 0.000534, "vq_loss_layer_024": 0.000475, "vq_loss_layer_025": 0.00045, "vq_loss_layer_026": 0.000637, "vq_loss_layer_027": 0.000957, "vq_loss_layer_028": 0.001099, "vq_loss_layer_029": 0.001846, "vq_loss_layer_030": 0.002655, "vq_loss_layer_031": 0.007599 }, { "ce_loss": 2.303643, "epoch": 0.00189, "grad_norm": 0.006368412636220455, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.060822, "kv_vq_loss": 0.000836, "learning_rate": 0.000819115451043311, "loss": 0.061658, "step": 1890, "value_mse_loss_layer_000": 0.000957, "value_mse_loss_layer_001": 0.00264, "value_mse_loss_layer_002": 0.010376, "value_mse_loss_layer_003": 0.016602, "value_mse_loss_layer_004": 0.01532, "value_mse_loss_layer_005": 0.015076, "value_mse_loss_layer_006": 0.0177, "value_mse_loss_layer_007": 0.02063, "value_mse_loss_layer_008": 0.023071, "value_mse_loss_layer_009": 0.030151, "value_mse_loss_layer_010": 0.025757, "value_mse_loss_layer_011": 0.027832, "value_mse_loss_layer_012": 0.029419, "value_mse_loss_layer_013": 0.029297, "value_mse_loss_layer_014": 0.031982, "value_mse_loss_layer_015": 0.033203, "value_mse_loss_layer_016": 0.028809, "value_mse_loss_layer_017": 0.034912, "value_mse_loss_layer_018": 0.032715, "value_mse_loss_layer_019": 0.038086, "value_mse_loss_layer_020": 0.039062, "value_mse_loss_layer_021": 0.045898, "value_mse_loss_layer_022": 0.044922, "value_mse_loss_layer_023": 0.051025, "value_mse_loss_layer_024": 0.056152, "value_mse_loss_layer_025": 0.092285, "value_mse_loss_layer_026": 0.061523, "value_mse_loss_layer_027": 0.076172, "value_mse_loss_layer_028": 0.074219, "value_mse_loss_layer_029": 0.118652, "value_mse_loss_layer_030": 0.108398, "value_mse_loss_layer_031": 0.132812, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 0.000101, "vq_loss_layer_005": 0.000109, "vq_loss_layer_006": 0.000188, "vq_loss_layer_007": 0.000311, "vq_loss_layer_008": 0.000275, "vq_loss_layer_009": 0.000355, "vq_loss_layer_010": 0.00032, "vq_loss_layer_011": 0.000357, "vq_loss_layer_012": 0.000713, "vq_loss_layer_013": 0.000492, "vq_loss_layer_014": 0.000626, "vq_loss_layer_015": 0.000641, "vq_loss_layer_016": 0.000591, "vq_loss_layer_017": 0.000679, "vq_loss_layer_018": 0.000347, "vq_loss_layer_019": 0.000296, "vq_loss_layer_020": 0.000334, "vq_loss_layer_021": 0.000626, "vq_loss_layer_022": 0.000364, "vq_loss_layer_023": 0.000433, "vq_loss_layer_024": 0.000418, "vq_loss_layer_025": 0.000763, "vq_loss_layer_026": 0.000957, "vq_loss_layer_027": 0.000809, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.002121, "vq_loss_layer_030": 0.003525, "vq_loss_layer_031": 0.00885 }, { "ce_loss": 2.247436, "epoch": 0.0019, "grad_norm": 0.004540968686342239, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.060818, "kv_vq_loss": 0.000828, "learning_rate": 0.0008196884002382071, "loss": 0.061636, "step": 1900, "value_mse_loss_layer_000": 0.000954, "value_mse_loss_layer_001": 0.002594, "value_mse_loss_layer_002": 0.011658, "value_mse_loss_layer_003": 0.016724, "value_mse_loss_layer_004": 0.01532, "value_mse_loss_layer_005": 0.015259, "value_mse_loss_layer_006": 0.0177, "value_mse_loss_layer_007": 0.020142, "value_mse_loss_layer_008": 0.024902, "value_mse_loss_layer_009": 0.033203, "value_mse_loss_layer_010": 0.029053, "value_mse_loss_layer_011": 0.027954, "value_mse_loss_layer_012": 0.028442, "value_mse_loss_layer_013": 0.031982, "value_mse_loss_layer_014": 0.031982, "value_mse_loss_layer_015": 0.033447, "value_mse_loss_layer_016": 0.032959, "value_mse_loss_layer_017": 0.033691, "value_mse_loss_layer_018": 0.030762, "value_mse_loss_layer_019": 0.038574, "value_mse_loss_layer_020": 0.038818, "value_mse_loss_layer_021": 0.048096, "value_mse_loss_layer_022": 0.04248, "value_mse_loss_layer_023": 0.049805, "value_mse_loss_layer_024": 0.054688, "value_mse_loss_layer_025": 0.080566, "value_mse_loss_layer_026": 0.056152, "value_mse_loss_layer_027": 0.078613, "value_mse_loss_layer_028": 0.073242, "value_mse_loss_layer_029": 0.107422, "value_mse_loss_layer_030": 0.096191, "value_mse_loss_layer_031": 0.129883, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 2.2e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 9.2e-05, "vq_loss_layer_005": 0.000107, "vq_loss_layer_006": 0.000188, "vq_loss_layer_007": 0.000275, "vq_loss_layer_008": 0.00037, "vq_loss_layer_009": 0.000534, "vq_loss_layer_010": 0.000408, "vq_loss_layer_011": 0.000381, "vq_loss_layer_012": 0.00058, "vq_loss_layer_013": 0.000523, "vq_loss_layer_014": 0.000618, "vq_loss_layer_015": 0.00069, "vq_loss_layer_016": 0.00071, "vq_loss_layer_017": 0.000526, "vq_loss_layer_018": 0.000298, "vq_loss_layer_019": 0.000336, "vq_loss_layer_020": 0.000292, "vq_loss_layer_021": 0.00066, "vq_loss_layer_022": 0.000345, "vq_loss_layer_023": 0.000471, "vq_loss_layer_024": 0.000383, "vq_loss_layer_025": 0.000656, "vq_loss_layer_026": 0.000717, "vq_loss_layer_027": 0.001038, "vq_loss_layer_028": 0.001122, "vq_loss_layer_029": 0.002106, "vq_loss_layer_030": 0.00264, "vq_loss_layer_031": 0.009216 }, { "ce_loss": 2.286307, "epoch": 0.00191, "grad_norm": 0.00517389178276062, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.060858, "kv_vq_loss": 0.000835, "learning_rate": 0.0008202583418119318, "loss": 0.061688, "step": 1910, "value_mse_loss_layer_000": 0.000973, "value_mse_loss_layer_001": 0.002579, "value_mse_loss_layer_002": 0.010254, "value_mse_loss_layer_003": 0.016235, "value_mse_loss_layer_004": 0.014648, "value_mse_loss_layer_005": 0.014954, "value_mse_loss_layer_006": 0.017944, "value_mse_loss_layer_007": 0.020508, "value_mse_loss_layer_008": 0.023682, "value_mse_loss_layer_009": 0.03064, "value_mse_loss_layer_010": 0.027466, "value_mse_loss_layer_011": 0.029053, "value_mse_loss_layer_012": 0.030273, "value_mse_loss_layer_013": 0.030029, "value_mse_loss_layer_014": 0.031738, "value_mse_loss_layer_015": 0.033691, "value_mse_loss_layer_016": 0.029175, "value_mse_loss_layer_017": 0.033936, "value_mse_loss_layer_018": 0.032715, "value_mse_loss_layer_019": 0.035889, "value_mse_loss_layer_020": 0.04126, "value_mse_loss_layer_021": 0.046875, "value_mse_loss_layer_022": 0.046143, "value_mse_loss_layer_023": 0.050537, "value_mse_loss_layer_024": 0.052734, "value_mse_loss_layer_025": 0.069336, "value_mse_loss_layer_026": 0.059082, "value_mse_loss_layer_027": 0.074219, "value_mse_loss_layer_028": 0.074707, "value_mse_loss_layer_029": 0.117676, "value_mse_loss_layer_030": 0.106934, "value_mse_loss_layer_031": 0.124023, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 9.3e-05, "vq_loss_layer_005": 0.000115, "vq_loss_layer_006": 0.000212, "vq_loss_layer_007": 0.000303, "vq_loss_layer_008": 0.000299, "vq_loss_layer_009": 0.000385, "vq_loss_layer_010": 0.000378, "vq_loss_layer_011": 0.00042, "vq_loss_layer_012": 0.00069, "vq_loss_layer_013": 0.000469, "vq_loss_layer_014": 0.000595, "vq_loss_layer_015": 0.000671, "vq_loss_layer_016": 0.000603, "vq_loss_layer_017": 0.000603, "vq_loss_layer_018": 0.00034, "vq_loss_layer_019": 0.000257, "vq_loss_layer_020": 0.000372, "vq_loss_layer_021": 0.000637, "vq_loss_layer_022": 0.000439, "vq_loss_layer_023": 0.000454, "vq_loss_layer_024": 0.000401, "vq_loss_layer_025": 0.00053, "vq_loss_layer_026": 0.000816, "vq_loss_layer_027": 0.000843, "vq_loss_layer_028": 0.001198, "vq_loss_layer_029": 0.002106, "vq_loss_layer_030": 0.00322, "vq_loss_layer_031": 0.007599 }, { "ce_loss": 2.268914, "epoch": 0.00192, "grad_norm": 0.004685443360358477, "key_mse_loss_layer_000": 0.003708, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.106445, "key_mse_loss_layer_014": 0.103516, "key_mse_loss_layer_015": 0.092285, "key_mse_loss_layer_016": 0.083984, "key_mse_loss_layer_017": 0.089844, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.060577, "kv_vq_loss": 0.000841, "learning_rate": 0.0008208253071758873, "loss": 0.061401, "step": 1920, "value_mse_loss_layer_000": 0.000923, "value_mse_loss_layer_001": 0.00264, "value_mse_loss_layer_002": 0.010254, "value_mse_loss_layer_003": 0.016357, "value_mse_loss_layer_004": 0.016235, "value_mse_loss_layer_005": 0.015747, "value_mse_loss_layer_006": 0.018311, "value_mse_loss_layer_007": 0.019775, "value_mse_loss_layer_008": 0.02417, "value_mse_loss_layer_009": 0.029663, "value_mse_loss_layer_010": 0.025757, "value_mse_loss_layer_011": 0.028198, "value_mse_loss_layer_012": 0.028442, "value_mse_loss_layer_013": 0.028687, "value_mse_loss_layer_014": 0.035889, "value_mse_loss_layer_015": 0.032959, "value_mse_loss_layer_016": 0.028442, "value_mse_loss_layer_017": 0.032471, "value_mse_loss_layer_018": 0.033447, "value_mse_loss_layer_019": 0.040283, "value_mse_loss_layer_020": 0.038818, "value_mse_loss_layer_021": 0.050537, "value_mse_loss_layer_022": 0.04541, "value_mse_loss_layer_023": 0.053711, "value_mse_loss_layer_024": 0.059814, "value_mse_loss_layer_025": 0.076172, "value_mse_loss_layer_026": 0.062256, "value_mse_loss_layer_027": 0.082031, "value_mse_loss_layer_028": 0.081543, "value_mse_loss_layer_029": 0.126953, "value_mse_loss_layer_030": 0.109863, "value_mse_loss_layer_031": 0.129883, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 0.000107, "vq_loss_layer_005": 0.000116, "vq_loss_layer_006": 0.000215, "vq_loss_layer_007": 0.000284, "vq_loss_layer_008": 0.000296, "vq_loss_layer_009": 0.000324, "vq_loss_layer_010": 0.000313, "vq_loss_layer_011": 0.000416, "vq_loss_layer_012": 0.000595, "vq_loss_layer_013": 0.00045, "vq_loss_layer_014": 0.000721, "vq_loss_layer_015": 0.000626, "vq_loss_layer_016": 0.000607, "vq_loss_layer_017": 0.000492, "vq_loss_layer_018": 0.000389, "vq_loss_layer_019": 0.000301, "vq_loss_layer_020": 0.00029, "vq_loss_layer_021": 0.000614, "vq_loss_layer_022": 0.000353, "vq_loss_layer_023": 0.000364, "vq_loss_layer_024": 0.000404, "vq_loss_layer_025": 0.000399, "vq_loss_layer_026": 0.000614, "vq_loss_layer_027": 0.000778, "vq_loss_layer_028": 0.001083, "vq_loss_layer_029": 0.002487, "vq_loss_layer_030": 0.003723, "vq_loss_layer_031": 0.007507 }, { "ce_loss": 2.292335, "epoch": 0.00193, "grad_norm": 0.004123341292142868, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.060565, "kv_vq_loss": 0.000821, "learning_rate": 0.0008213893272519434, "loss": 0.06138, "step": 1930, "value_mse_loss_layer_000": 0.000942, "value_mse_loss_layer_001": 0.002594, "value_mse_loss_layer_002": 0.010742, "value_mse_loss_layer_003": 0.019043, "value_mse_loss_layer_004": 0.015625, "value_mse_loss_layer_005": 0.015747, "value_mse_loss_layer_006": 0.018188, "value_mse_loss_layer_007": 0.021118, "value_mse_loss_layer_008": 0.02417, "value_mse_loss_layer_009": 0.03125, "value_mse_loss_layer_010": 0.026489, "value_mse_loss_layer_011": 0.028564, "value_mse_loss_layer_012": 0.032471, "value_mse_loss_layer_013": 0.030273, "value_mse_loss_layer_014": 0.032227, "value_mse_loss_layer_015": 0.033203, "value_mse_loss_layer_016": 0.034912, "value_mse_loss_layer_017": 0.033936, "value_mse_loss_layer_018": 0.032471, "value_mse_loss_layer_019": 0.037842, "value_mse_loss_layer_020": 0.03833, "value_mse_loss_layer_021": 0.048096, "value_mse_loss_layer_022": 0.046143, "value_mse_loss_layer_023": 0.052246, "value_mse_loss_layer_024": 0.057373, "value_mse_loss_layer_025": 0.072754, "value_mse_loss_layer_026": 0.060059, "value_mse_loss_layer_027": 0.086914, "value_mse_loss_layer_028": 0.078125, "value_mse_loss_layer_029": 0.113281, "value_mse_loss_layer_030": 0.102051, "value_mse_loss_layer_031": 0.142578, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 3e-05, "vq_loss_layer_003": 6.9e-05, "vq_loss_layer_004": 0.000105, "vq_loss_layer_005": 0.000141, "vq_loss_layer_006": 0.000222, "vq_loss_layer_007": 0.000328, "vq_loss_layer_008": 0.000366, "vq_loss_layer_009": 0.000422, "vq_loss_layer_010": 0.00037, "vq_loss_layer_011": 0.000399, "vq_loss_layer_012": 0.000828, "vq_loss_layer_013": 0.000519, "vq_loss_layer_014": 0.000633, "vq_loss_layer_015": 0.000671, "vq_loss_layer_016": 0.000866, "vq_loss_layer_017": 0.00061, "vq_loss_layer_018": 0.000366, "vq_loss_layer_019": 0.000385, "vq_loss_layer_020": 0.000355, "vq_loss_layer_021": 0.000652, "vq_loss_layer_022": 0.00053, "vq_loss_layer_023": 0.00058, "vq_loss_layer_024": 0.000565, "vq_loss_layer_025": 0.000767, "vq_loss_layer_026": 0.00103, "vq_loss_layer_027": 0.001335, "vq_loss_layer_028": 0.001617, "vq_loss_layer_029": 0.002655, "vq_loss_layer_030": 0.003464, "vq_loss_layer_031": 0.011475 }, { "ce_loss": 2.268777, "epoch": 0.00194, "grad_norm": 0.006567159201949835, "key_mse_loss_layer_000": 0.003616, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.060675, "kv_vq_loss": 0.000853, "learning_rate": 0.0008219504324825564, "loss": 0.06152, "step": 1940, "value_mse_loss_layer_000": 0.000954, "value_mse_loss_layer_001": 0.002594, "value_mse_loss_layer_002": 0.010315, "value_mse_loss_layer_003": 0.017578, "value_mse_loss_layer_004": 0.016846, "value_mse_loss_layer_005": 0.015198, "value_mse_loss_layer_006": 0.017944, "value_mse_loss_layer_007": 0.02002, "value_mse_loss_layer_008": 0.02356, "value_mse_loss_layer_009": 0.030518, "value_mse_loss_layer_010": 0.030273, "value_mse_loss_layer_011": 0.028564, "value_mse_loss_layer_012": 0.03125, "value_mse_loss_layer_013": 0.030273, "value_mse_loss_layer_014": 0.033203, "value_mse_loss_layer_015": 0.039551, "value_mse_loss_layer_016": 0.029907, "value_mse_loss_layer_017": 0.031494, "value_mse_loss_layer_018": 0.042236, "value_mse_loss_layer_019": 0.037354, "value_mse_loss_layer_020": 0.039795, "value_mse_loss_layer_021": 0.046631, "value_mse_loss_layer_022": 0.044678, "value_mse_loss_layer_023": 0.050049, "value_mse_loss_layer_024": 0.057617, "value_mse_loss_layer_025": 0.072266, "value_mse_loss_layer_026": 0.05957, "value_mse_loss_layer_027": 0.07666, "value_mse_loss_layer_028": 0.080566, "value_mse_loss_layer_029": 0.112793, "value_mse_loss_layer_030": 0.125977, "value_mse_loss_layer_031": 0.128906, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 4.7e-05, "vq_loss_layer_004": 0.000146, "vq_loss_layer_005": 0.000111, "vq_loss_layer_006": 0.000213, "vq_loss_layer_007": 0.000288, "vq_loss_layer_008": 0.000324, "vq_loss_layer_009": 0.000401, "vq_loss_layer_010": 0.000431, "vq_loss_layer_011": 0.000408, "vq_loss_layer_012": 0.000687, "vq_loss_layer_013": 0.00058, "vq_loss_layer_014": 0.000656, "vq_loss_layer_015": 0.000916, "vq_loss_layer_016": 0.000717, "vq_loss_layer_017": 0.000479, "vq_loss_layer_018": 0.000486, "vq_loss_layer_019": 0.000273, "vq_loss_layer_020": 0.000317, "vq_loss_layer_021": 0.000645, "vq_loss_layer_022": 0.000433, "vq_loss_layer_023": 0.000406, "vq_loss_layer_024": 0.000433, "vq_loss_layer_025": 0.000534, "vq_loss_layer_026": 0.000813, "vq_loss_layer_027": 0.000832, "vq_loss_layer_028": 0.00135, "vq_loss_layer_029": 0.002182, "vq_loss_layer_030": 0.004272, "vq_loss_layer_031": 0.008118 }, { "ce_loss": 2.314775, "epoch": 0.00195, "grad_norm": 0.004562267567962408, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.060562, "kv_vq_loss": 0.000804, "learning_rate": 0.0008225086528406293, "loss": 0.061356, "step": 1950, "value_mse_loss_layer_000": 0.00095, "value_mse_loss_layer_001": 0.002594, "value_mse_loss_layer_002": 0.010742, "value_mse_loss_layer_003": 0.016602, "value_mse_loss_layer_004": 0.015198, "value_mse_loss_layer_005": 0.016479, "value_mse_loss_layer_006": 0.0177, "value_mse_loss_layer_007": 0.02063, "value_mse_loss_layer_008": 0.023804, "value_mse_loss_layer_009": 0.03064, "value_mse_loss_layer_010": 0.0271, "value_mse_loss_layer_011": 0.028931, "value_mse_loss_layer_012": 0.029785, "value_mse_loss_layer_013": 0.03064, "value_mse_loss_layer_014": 0.032471, "value_mse_loss_layer_015": 0.034424, "value_mse_loss_layer_016": 0.029175, "value_mse_loss_layer_017": 0.033203, "value_mse_loss_layer_018": 0.03125, "value_mse_loss_layer_019": 0.036377, "value_mse_loss_layer_020": 0.037842, "value_mse_loss_layer_021": 0.057861, "value_mse_loss_layer_022": 0.044189, "value_mse_loss_layer_023": 0.048584, "value_mse_loss_layer_024": 0.050537, "value_mse_loss_layer_025": 0.066406, "value_mse_loss_layer_026": 0.051758, "value_mse_loss_layer_027": 0.069824, "value_mse_loss_layer_028": 0.07373, "value_mse_loss_layer_029": 0.105957, "value_mse_loss_layer_030": 0.100098, "value_mse_loss_layer_031": 0.124023, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 0.000103, "vq_loss_layer_005": 0.00014, "vq_loss_layer_006": 0.000204, "vq_loss_layer_007": 0.000301, "vq_loss_layer_008": 0.000315, "vq_loss_layer_009": 0.000391, "vq_loss_layer_010": 0.00036, "vq_loss_layer_011": 0.000431, "vq_loss_layer_012": 0.000648, "vq_loss_layer_013": 0.000504, "vq_loss_layer_014": 0.000629, "vq_loss_layer_015": 0.000648, "vq_loss_layer_016": 0.000607, "vq_loss_layer_017": 0.000538, "vq_loss_layer_018": 0.000309, "vq_loss_layer_019": 0.000263, "vq_loss_layer_020": 0.000328, "vq_loss_layer_021": 0.000977, "vq_loss_layer_022": 0.000439, "vq_loss_layer_023": 0.000479, "vq_loss_layer_024": 0.000418, "vq_loss_layer_025": 0.000504, "vq_loss_layer_026": 0.000614, "vq_loss_layer_027": 0.000771, "vq_loss_layer_028": 0.001137, "vq_loss_layer_029": 0.001846, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.007629 }, { "ce_loss": 2.313882, "epoch": 0.00196, "grad_norm": 0.004201068077236414, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.061035, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.095215, "key_mse_loss_layer_009": 0.101074, "key_mse_loss_layer_010": 0.114746, "key_mse_loss_layer_011": 0.109375, "key_mse_loss_layer_012": 0.080078, "key_mse_loss_layer_013": 0.142578, "key_mse_loss_layer_014": 0.138672, "key_mse_loss_layer_015": 0.12793, "key_mse_loss_layer_016": 0.120605, "key_mse_loss_layer_017": 0.117188, "key_mse_loss_layer_018": 0.125977, "key_mse_loss_layer_019": 0.099609, "key_mse_loss_layer_020": 0.116211, "key_mse_loss_layer_021": 0.111328, "key_mse_loss_layer_022": 0.117188, "key_mse_loss_layer_023": 0.114258, "key_mse_loss_layer_024": 0.09082, "key_mse_loss_layer_025": 0.085938, "key_mse_loss_layer_026": 0.102051, "key_mse_loss_layer_027": 0.103027, "key_mse_loss_layer_028": 0.107422, "key_mse_loss_layer_029": 0.097168, "key_mse_loss_layer_030": 0.104004, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.060422, "kv_vq_loss": 0.000803, "learning_rate": 0.0008230640178391188, "loss": 0.061203, "step": 1960, "value_mse_loss_layer_000": 0.000942, "value_mse_loss_layer_001": 0.002762, "value_mse_loss_layer_002": 0.010254, "value_mse_loss_layer_003": 0.016235, "value_mse_loss_layer_004": 0.016357, "value_mse_loss_layer_005": 0.015869, "value_mse_loss_layer_006": 0.019531, "value_mse_loss_layer_007": 0.021118, "value_mse_loss_layer_008": 0.024658, "value_mse_loss_layer_009": 0.030029, "value_mse_loss_layer_010": 0.026489, "value_mse_loss_layer_011": 0.027954, "value_mse_loss_layer_012": 0.028442, "value_mse_loss_layer_013": 0.029785, "value_mse_loss_layer_014": 0.03125, "value_mse_loss_layer_015": 0.031494, "value_mse_loss_layer_016": 0.027466, "value_mse_loss_layer_017": 0.031128, "value_mse_loss_layer_018": 0.030884, "value_mse_loss_layer_019": 0.036133, "value_mse_loss_layer_020": 0.036865, "value_mse_loss_layer_021": 0.04834, "value_mse_loss_layer_022": 0.042725, "value_mse_loss_layer_023": 0.049561, "value_mse_loss_layer_024": 0.054199, "value_mse_loss_layer_025": 0.067383, "value_mse_loss_layer_026": 0.055908, "value_mse_loss_layer_027": 0.080078, "value_mse_loss_layer_028": 0.078613, "value_mse_loss_layer_029": 0.119629, "value_mse_loss_layer_030": 0.104492, "value_mse_loss_layer_031": 0.134766, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 4.5e-05, "vq_loss_layer_002": 2.8e-05, "vq_loss_layer_003": 6.1e-05, "vq_loss_layer_004": 0.000125, "vq_loss_layer_005": 0.000148, "vq_loss_layer_006": 0.000286, "vq_loss_layer_007": 0.00034, "vq_loss_layer_008": 0.000469, "vq_loss_layer_009": 0.000462, "vq_loss_layer_010": 0.000469, "vq_loss_layer_011": 0.000458, "vq_loss_layer_012": 0.000698, "vq_loss_layer_013": 0.000557, "vq_loss_layer_014": 0.000805, "vq_loss_layer_015": 0.00074, "vq_loss_layer_016": 0.000751, "vq_loss_layer_017": 0.000607, "vq_loss_layer_018": 0.00036, "vq_loss_layer_019": 0.000347, "vq_loss_layer_020": 0.000399, "vq_loss_layer_021": 0.000969, "vq_loss_layer_022": 0.000475, "vq_loss_layer_023": 0.000523, "vq_loss_layer_024": 0.000549, "vq_loss_layer_025": 0.000729, "vq_loss_layer_026": 0.000946, "vq_loss_layer_027": 0.001183, "vq_loss_layer_028": 0.002014, "vq_loss_layer_029": 0.003296, "vq_loss_layer_030": 0.004028, "vq_loss_layer_031": 0.011353 }, { "ce_loss": 2.309288, "epoch": 0.00197, "grad_norm": 0.005016332026571035, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.060056, "kv_vq_loss": 0.000788, "learning_rate": 0.0008236165565403982, "loss": 0.060843, "step": 1970, "value_mse_loss_layer_000": 0.000908, "value_mse_loss_layer_001": 0.002563, "value_mse_loss_layer_002": 0.010071, "value_mse_loss_layer_003": 0.016602, "value_mse_loss_layer_004": 0.014587, "value_mse_loss_layer_005": 0.014343, "value_mse_loss_layer_006": 0.017822, "value_mse_loss_layer_007": 0.019531, "value_mse_loss_layer_008": 0.022827, "value_mse_loss_layer_009": 0.03125, "value_mse_loss_layer_010": 0.029297, "value_mse_loss_layer_011": 0.028564, "value_mse_loss_layer_012": 0.029419, "value_mse_loss_layer_013": 0.031494, "value_mse_loss_layer_014": 0.032227, "value_mse_loss_layer_015": 0.034912, "value_mse_loss_layer_016": 0.029419, "value_mse_loss_layer_017": 0.033936, "value_mse_loss_layer_018": 0.031738, "value_mse_loss_layer_019": 0.041504, "value_mse_loss_layer_020": 0.043945, "value_mse_loss_layer_021": 0.046143, "value_mse_loss_layer_022": 0.045654, "value_mse_loss_layer_023": 0.05127, "value_mse_loss_layer_024": 0.056885, "value_mse_loss_layer_025": 0.06543, "value_mse_loss_layer_026": 0.066895, "value_mse_loss_layer_027": 0.075195, "value_mse_loss_layer_028": 0.071777, "value_mse_loss_layer_029": 0.104492, "value_mse_loss_layer_030": 0.096191, "value_mse_loss_layer_031": 0.121094, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 9.6e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000196, "vq_loss_layer_007": 0.000278, "vq_loss_layer_008": 0.000275, "vq_loss_layer_009": 0.000423, "vq_loss_layer_010": 0.000357, "vq_loss_layer_011": 0.000385, "vq_loss_layer_012": 0.000633, "vq_loss_layer_013": 0.000515, "vq_loss_layer_014": 0.000599, "vq_loss_layer_015": 0.000652, "vq_loss_layer_016": 0.000587, "vq_loss_layer_017": 0.000576, "vq_loss_layer_018": 0.000328, "vq_loss_layer_019": 0.000313, "vq_loss_layer_020": 0.000385, "vq_loss_layer_021": 0.000629, "vq_loss_layer_022": 0.000465, "vq_loss_layer_023": 0.000519, "vq_loss_layer_024": 0.000471, "vq_loss_layer_025": 0.000452, "vq_loss_layer_026": 0.001091, "vq_loss_layer_027": 0.000801, "vq_loss_layer_028": 0.001022, "vq_loss_layer_029": 0.001678, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.006958 }, { "ce_loss": 2.298821, "epoch": 0.00198, "grad_norm": 0.00637655146420002, "key_mse_loss_layer_000": 0.003845, "key_mse_loss_layer_001": 0.010925, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.060333, "kv_vq_loss": 0.000796, "learning_rate": 0.0008241662975653827, "loss": 0.06113, "step": 1980, "value_mse_loss_layer_000": 0.00095, "value_mse_loss_layer_001": 0.002625, "value_mse_loss_layer_002": 0.010864, "value_mse_loss_layer_003": 0.016724, "value_mse_loss_layer_004": 0.015625, "value_mse_loss_layer_005": 0.015625, "value_mse_loss_layer_006": 0.017456, "value_mse_loss_layer_007": 0.020752, "value_mse_loss_layer_008": 0.023438, "value_mse_loss_layer_009": 0.029663, "value_mse_loss_layer_010": 0.025513, "value_mse_loss_layer_011": 0.027344, "value_mse_loss_layer_012": 0.029297, "value_mse_loss_layer_013": 0.029053, "value_mse_loss_layer_014": 0.03125, "value_mse_loss_layer_015": 0.032471, "value_mse_loss_layer_016": 0.029053, "value_mse_loss_layer_017": 0.032715, "value_mse_loss_layer_018": 0.031494, "value_mse_loss_layer_019": 0.036865, "value_mse_loss_layer_020": 0.037354, "value_mse_loss_layer_021": 0.046631, "value_mse_loss_layer_022": 0.043701, "value_mse_loss_layer_023": 0.052246, "value_mse_loss_layer_024": 0.056641, "value_mse_loss_layer_025": 0.073242, "value_mse_loss_layer_026": 0.060791, "value_mse_loss_layer_027": 0.077637, "value_mse_loss_layer_028": 0.078125, "value_mse_loss_layer_029": 0.119629, "value_mse_loss_layer_030": 0.122559, "value_mse_loss_layer_031": 0.133789, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 9.9e-05, "vq_loss_layer_005": 0.000124, "vq_loss_layer_006": 0.000184, "vq_loss_layer_007": 0.000324, "vq_loss_layer_008": 0.000336, "vq_loss_layer_009": 0.000401, "vq_loss_layer_010": 0.000341, "vq_loss_layer_011": 0.000385, "vq_loss_layer_012": 0.000668, "vq_loss_layer_013": 0.000492, "vq_loss_layer_014": 0.000645, "vq_loss_layer_015": 0.000645, "vq_loss_layer_016": 0.000664, "vq_loss_layer_017": 0.00066, "vq_loss_layer_018": 0.000326, "vq_loss_layer_019": 0.000286, "vq_loss_layer_020": 0.000282, "vq_loss_layer_021": 0.000626, "vq_loss_layer_022": 0.000374, "vq_loss_layer_023": 0.000462, "vq_loss_layer_024": 0.000448, "vq_loss_layer_025": 0.00061, "vq_loss_layer_026": 0.000912, "vq_loss_layer_027": 0.000931, "vq_loss_layer_028": 0.001419, "vq_loss_layer_029": 0.002274, "vq_loss_layer_030": 0.004211, "vq_loss_layer_031": 0.009338 }, { "ce_loss": 2.285342, "epoch": 0.00199, "grad_norm": 0.0053199962712824345, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.060349, "kv_vq_loss": 0.00082, "learning_rate": 0.0008247132691024266, "loss": 0.061157, "step": 1990, "value_mse_loss_layer_000": 0.000946, "value_mse_loss_layer_001": 0.002563, "value_mse_loss_layer_002": 0.010193, "value_mse_loss_layer_003": 0.016479, "value_mse_loss_layer_004": 0.014771, "value_mse_loss_layer_005": 0.015503, "value_mse_loss_layer_006": 0.018066, "value_mse_loss_layer_007": 0.019531, "value_mse_loss_layer_008": 0.022949, "value_mse_loss_layer_009": 0.030884, "value_mse_loss_layer_010": 0.025391, "value_mse_loss_layer_011": 0.027954, "value_mse_loss_layer_012": 0.028442, "value_mse_loss_layer_013": 0.029663, "value_mse_loss_layer_014": 0.035645, "value_mse_loss_layer_015": 0.034424, "value_mse_loss_layer_016": 0.030884, "value_mse_loss_layer_017": 0.032959, "value_mse_loss_layer_018": 0.034424, "value_mse_loss_layer_019": 0.036133, "value_mse_loss_layer_020": 0.03833, "value_mse_loss_layer_021": 0.046875, "value_mse_loss_layer_022": 0.044922, "value_mse_loss_layer_023": 0.048096, "value_mse_loss_layer_024": 0.057617, "value_mse_loss_layer_025": 0.071289, "value_mse_loss_layer_026": 0.054932, "value_mse_loss_layer_027": 0.078613, "value_mse_loss_layer_028": 0.074219, "value_mse_loss_layer_029": 0.112305, "value_mse_loss_layer_030": 0.098633, "value_mse_loss_layer_031": 0.12793, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 8.8e-05, "vq_loss_layer_005": 0.000129, "vq_loss_layer_006": 0.000206, "vq_loss_layer_007": 0.000263, "vq_loss_layer_008": 0.000277, "vq_loss_layer_009": 0.000422, "vq_loss_layer_010": 0.000303, "vq_loss_layer_011": 0.000397, "vq_loss_layer_012": 0.000584, "vq_loss_layer_013": 0.000463, "vq_loss_layer_014": 0.000713, "vq_loss_layer_015": 0.000645, "vq_loss_layer_016": 0.000637, "vq_loss_layer_017": 0.0005, "vq_loss_layer_018": 0.000334, "vq_loss_layer_019": 0.000252, "vq_loss_layer_020": 0.000313, "vq_loss_layer_021": 0.000645, "vq_loss_layer_022": 0.000387, "vq_loss_layer_023": 0.000387, "vq_loss_layer_024": 0.000402, "vq_loss_layer_025": 0.000504, "vq_loss_layer_026": 0.000656, "vq_loss_layer_027": 0.000893, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001923, "vq_loss_layer_030": 0.002777, "vq_loss_layer_031": 0.007812 }, { "ce_loss": 2.311509, "epoch": 0.002, "grad_norm": 0.005792645271867514, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.060449, "kv_vq_loss": 0.000795, "learning_rate": 0.0008252574989159952, "loss": 0.061237, "step": 2000, "value_mse_loss_layer_000": 0.000927, "value_mse_loss_layer_001": 0.002579, "value_mse_loss_layer_002": 0.010193, "value_mse_loss_layer_003": 0.015991, "value_mse_loss_layer_004": 0.015625, "value_mse_loss_layer_005": 0.014587, "value_mse_loss_layer_006": 0.017334, "value_mse_loss_layer_007": 0.02002, "value_mse_loss_layer_008": 0.02356, "value_mse_loss_layer_009": 0.031006, "value_mse_loss_layer_010": 0.02771, "value_mse_loss_layer_011": 0.02832, "value_mse_loss_layer_012": 0.031982, "value_mse_loss_layer_013": 0.030762, "value_mse_loss_layer_014": 0.032227, "value_mse_loss_layer_015": 0.034668, "value_mse_loss_layer_016": 0.028931, "value_mse_loss_layer_017": 0.036133, "value_mse_loss_layer_018": 0.033691, "value_mse_loss_layer_019": 0.038818, "value_mse_loss_layer_020": 0.039795, "value_mse_loss_layer_021": 0.044922, "value_mse_loss_layer_022": 0.044189, "value_mse_loss_layer_023": 0.059814, "value_mse_loss_layer_024": 0.056396, "value_mse_loss_layer_025": 0.06543, "value_mse_loss_layer_026": 0.053711, "value_mse_loss_layer_027": 0.071289, "value_mse_loss_layer_028": 0.075195, "value_mse_loss_layer_029": 0.120117, "value_mse_loss_layer_030": 0.093262, "value_mse_loss_layer_031": 0.119629, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 0.000155, "vq_loss_layer_005": 0.000111, "vq_loss_layer_006": 0.000184, "vq_loss_layer_007": 0.000311, "vq_loss_layer_008": 0.000305, "vq_loss_layer_009": 0.000404, "vq_loss_layer_010": 0.000338, "vq_loss_layer_011": 0.000364, "vq_loss_layer_012": 0.00087, "vq_loss_layer_013": 0.000496, "vq_loss_layer_014": 0.000591, "vq_loss_layer_015": 0.000629, "vq_loss_layer_016": 0.000572, "vq_loss_layer_017": 0.000648, "vq_loss_layer_018": 0.000374, "vq_loss_layer_019": 0.000288, "vq_loss_layer_020": 0.000364, "vq_loss_layer_021": 0.000542, "vq_loss_layer_022": 0.000366, "vq_loss_layer_023": 0.000595, "vq_loss_layer_024": 0.000439, "vq_loss_layer_025": 0.000456, "vq_loss_layer_026": 0.000656, "vq_loss_layer_027": 0.000706, "vq_loss_layer_028": 0.00103, "vq_loss_layer_029": 0.001869, "vq_loss_layer_030": 0.002441, "vq_loss_layer_031": 0.006714 }, { "ce_loss": 2.282508, "epoch": 0.00201, "grad_norm": 0.005222219508141279, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.060129, "kv_vq_loss": 0.000792, "learning_rate": 0.0008257990143551221, "loss": 0.060919, "step": 2010, "value_mse_loss_layer_000": 0.000919, "value_mse_loss_layer_001": 0.002563, "value_mse_loss_layer_002": 0.011292, "value_mse_loss_layer_003": 0.015991, "value_mse_loss_layer_004": 0.014282, "value_mse_loss_layer_005": 0.014282, "value_mse_loss_layer_006": 0.018188, "value_mse_loss_layer_007": 0.019287, "value_mse_loss_layer_008": 0.023193, "value_mse_loss_layer_009": 0.029541, "value_mse_loss_layer_010": 0.025513, "value_mse_loss_layer_011": 0.028198, "value_mse_loss_layer_012": 0.031982, "value_mse_loss_layer_013": 0.029541, "value_mse_loss_layer_014": 0.030518, "value_mse_loss_layer_015": 0.033447, "value_mse_loss_layer_016": 0.031494, "value_mse_loss_layer_017": 0.032471, "value_mse_loss_layer_018": 0.032471, "value_mse_loss_layer_019": 0.035645, "value_mse_loss_layer_020": 0.037598, "value_mse_loss_layer_021": 0.047852, "value_mse_loss_layer_022": 0.042969, "value_mse_loss_layer_023": 0.05127, "value_mse_loss_layer_024": 0.053223, "value_mse_loss_layer_025": 0.067383, "value_mse_loss_layer_026": 0.061279, "value_mse_loss_layer_027": 0.073242, "value_mse_loss_layer_028": 0.072266, "value_mse_loss_layer_029": 0.115234, "value_mse_loss_layer_030": 0.09668, "value_mse_loss_layer_031": 0.118652, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 9.5e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000223, "vq_loss_layer_007": 0.000278, "vq_loss_layer_008": 0.00028, "vq_loss_layer_009": 0.000349, "vq_loss_layer_010": 0.00028, "vq_loss_layer_011": 0.00038, "vq_loss_layer_012": 0.00087, "vq_loss_layer_013": 0.000441, "vq_loss_layer_014": 0.000553, "vq_loss_layer_015": 0.000584, "vq_loss_layer_016": 0.00066, "vq_loss_layer_017": 0.000475, "vq_loss_layer_018": 0.000299, "vq_loss_layer_019": 0.00022, "vq_loss_layer_020": 0.000269, "vq_loss_layer_021": 0.000587, "vq_loss_layer_022": 0.000292, "vq_loss_layer_023": 0.000488, "vq_loss_layer_024": 0.000328, "vq_loss_layer_025": 0.000408, "vq_loss_layer_026": 0.000782, "vq_loss_layer_027": 0.000683, "vq_loss_layer_028": 0.000885, "vq_loss_layer_029": 0.001564, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.006439 }, { "ce_loss": 2.254658, "epoch": 0.00202, "grad_norm": 0.004919222556054592, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.043457, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.123535, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.060721, "kv_vq_loss": 0.000849, "learning_rate": 0.0008263378423616558, "loss": 0.061548, "step": 2020, "value_mse_loss_layer_000": 0.000961, "value_mse_loss_layer_001": 0.002609, "value_mse_loss_layer_002": 0.01062, "value_mse_loss_layer_003": 0.01709, "value_mse_loss_layer_004": 0.015747, "value_mse_loss_layer_005": 0.017334, "value_mse_loss_layer_006": 0.018311, "value_mse_loss_layer_007": 0.021606, "value_mse_loss_layer_008": 0.024292, "value_mse_loss_layer_009": 0.031982, "value_mse_loss_layer_010": 0.027588, "value_mse_loss_layer_011": 0.030029, "value_mse_loss_layer_012": 0.032227, "value_mse_loss_layer_013": 0.032959, "value_mse_loss_layer_014": 0.035156, "value_mse_loss_layer_015": 0.033691, "value_mse_loss_layer_016": 0.030151, "value_mse_loss_layer_017": 0.033203, "value_mse_loss_layer_018": 0.031494, "value_mse_loss_layer_019": 0.039062, "value_mse_loss_layer_020": 0.037598, "value_mse_loss_layer_021": 0.046387, "value_mse_loss_layer_022": 0.041748, "value_mse_loss_layer_023": 0.048096, "value_mse_loss_layer_024": 0.052979, "value_mse_loss_layer_025": 0.068848, "value_mse_loss_layer_026": 0.053711, "value_mse_loss_layer_027": 0.074707, "value_mse_loss_layer_028": 0.069336, "value_mse_loss_layer_029": 0.105957, "value_mse_loss_layer_030": 0.111816, "value_mse_loss_layer_031": 0.128906, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 3e-05, "vq_loss_layer_002": 3.7e-05, "vq_loss_layer_003": 6e-05, "vq_loss_layer_004": 0.000113, "vq_loss_layer_005": 0.000194, "vq_loss_layer_006": 0.000252, "vq_loss_layer_007": 0.000317, "vq_loss_layer_008": 0.00041, "vq_loss_layer_009": 0.000465, "vq_loss_layer_010": 0.000452, "vq_loss_layer_011": 0.0005, "vq_loss_layer_012": 0.000729, "vq_loss_layer_013": 0.000736, "vq_loss_layer_014": 0.000927, "vq_loss_layer_015": 0.000809, "vq_loss_layer_016": 0.000744, "vq_loss_layer_017": 0.000656, "vq_loss_layer_018": 0.000397, "vq_loss_layer_019": 0.000383, "vq_loss_layer_020": 0.000433, "vq_loss_layer_021": 0.000805, "vq_loss_layer_022": 0.000561, "vq_loss_layer_023": 0.000626, "vq_loss_layer_024": 0.000595, "vq_loss_layer_025": 0.000984, "vq_loss_layer_026": 0.001045, "vq_loss_layer_027": 0.001282, "vq_loss_layer_028": 0.001564, "vq_loss_layer_029": 0.002655, "vq_loss_layer_030": 0.004883, "vq_loss_layer_031": 0.010742 }, { "ce_loss": 2.25211, "epoch": 0.00203, "grad_norm": 0.006159435957670212, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.060315, "kv_vq_loss": 0.000796, "learning_rate": 0.0008268740094783031, "loss": 0.061096, "step": 2030, "value_mse_loss_layer_000": 0.000931, "value_mse_loss_layer_001": 0.002579, "value_mse_loss_layer_002": 0.010071, "value_mse_loss_layer_003": 0.016357, "value_mse_loss_layer_004": 0.015869, "value_mse_loss_layer_005": 0.015259, "value_mse_loss_layer_006": 0.018799, "value_mse_loss_layer_007": 0.019531, "value_mse_loss_layer_008": 0.023438, "value_mse_loss_layer_009": 0.030029, "value_mse_loss_layer_010": 0.028076, "value_mse_loss_layer_011": 0.02832, "value_mse_loss_layer_012": 0.029419, "value_mse_loss_layer_013": 0.029297, "value_mse_loss_layer_014": 0.032471, "value_mse_loss_layer_015": 0.03418, "value_mse_loss_layer_016": 0.028809, "value_mse_loss_layer_017": 0.032471, "value_mse_loss_layer_018": 0.031982, "value_mse_loss_layer_019": 0.036377, "value_mse_loss_layer_020": 0.036865, "value_mse_loss_layer_021": 0.046143, "value_mse_loss_layer_022": 0.043945, "value_mse_loss_layer_023": 0.047852, "value_mse_loss_layer_024": 0.054688, "value_mse_loss_layer_025": 0.070312, "value_mse_loss_layer_026": 0.056396, "value_mse_loss_layer_027": 0.073242, "value_mse_loss_layer_028": 0.077637, "value_mse_loss_layer_029": 0.117676, "value_mse_loss_layer_030": 0.102539, "value_mse_loss_layer_031": 0.122559, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 0.000112, "vq_loss_layer_005": 0.000122, "vq_loss_layer_006": 0.000237, "vq_loss_layer_007": 0.000273, "vq_loss_layer_008": 0.000313, "vq_loss_layer_009": 0.000385, "vq_loss_layer_010": 0.000378, "vq_loss_layer_011": 0.000376, "vq_loss_layer_012": 0.00061, "vq_loss_layer_013": 0.00046, "vq_loss_layer_014": 0.00066, "vq_loss_layer_015": 0.00071, "vq_loss_layer_016": 0.000576, "vq_loss_layer_017": 0.000694, "vq_loss_layer_018": 0.00029, "vq_loss_layer_019": 0.000237, "vq_loss_layer_020": 0.000305, "vq_loss_layer_021": 0.000587, "vq_loss_layer_022": 0.000381, "vq_loss_layer_023": 0.00042, "vq_loss_layer_024": 0.000492, "vq_loss_layer_025": 0.000526, "vq_loss_layer_026": 0.000774, "vq_loss_layer_027": 0.000813, "vq_loss_layer_028": 0.001114, "vq_loss_layer_029": 0.001892, "vq_loss_layer_030": 0.003006, "vq_loss_layer_031": 0.007141 }, { "ce_loss": 2.259426, "epoch": 0.00204, "grad_norm": 0.00441405363380909, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.121582, "key_mse_loss_layer_015": 0.110352, "key_mse_loss_layer_016": 0.104004, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.112305, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.098633, "key_mse_loss_layer_022": 0.102051, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.060019, "kv_vq_loss": 0.000796, "learning_rate": 0.0008274075418564746, "loss": 0.060812, "step": 2040, "value_mse_loss_layer_000": 0.000931, "value_mse_loss_layer_001": 0.002579, "value_mse_loss_layer_002": 0.010254, "value_mse_loss_layer_003": 0.016357, "value_mse_loss_layer_004": 0.01532, "value_mse_loss_layer_005": 0.015137, "value_mse_loss_layer_006": 0.018066, "value_mse_loss_layer_007": 0.019531, "value_mse_loss_layer_008": 0.022827, "value_mse_loss_layer_009": 0.029663, "value_mse_loss_layer_010": 0.024658, "value_mse_loss_layer_011": 0.026733, "value_mse_loss_layer_012": 0.027222, "value_mse_loss_layer_013": 0.029053, "value_mse_loss_layer_014": 0.030884, "value_mse_loss_layer_015": 0.031738, "value_mse_loss_layer_016": 0.032959, "value_mse_loss_layer_017": 0.031006, "value_mse_loss_layer_018": 0.030518, "value_mse_loss_layer_019": 0.035645, "value_mse_loss_layer_020": 0.040039, "value_mse_loss_layer_021": 0.049072, "value_mse_loss_layer_022": 0.040771, "value_mse_loss_layer_023": 0.052246, "value_mse_loss_layer_024": 0.052002, "value_mse_loss_layer_025": 0.06543, "value_mse_loss_layer_026": 0.052734, "value_mse_loss_layer_027": 0.070312, "value_mse_loss_layer_028": 0.071289, "value_mse_loss_layer_029": 0.107422, "value_mse_loss_layer_030": 0.09668, "value_mse_loss_layer_031": 0.125977, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 9.4e-05, "vq_loss_layer_005": 0.000109, "vq_loss_layer_006": 0.000212, "vq_loss_layer_007": 0.000269, "vq_loss_layer_008": 0.000313, "vq_loss_layer_009": 0.00042, "vq_loss_layer_010": 0.000332, "vq_loss_layer_011": 0.000404, "vq_loss_layer_012": 0.00061, "vq_loss_layer_013": 0.000484, "vq_loss_layer_014": 0.000713, "vq_loss_layer_015": 0.000584, "vq_loss_layer_016": 0.000813, "vq_loss_layer_017": 0.000479, "vq_loss_layer_018": 0.000303, "vq_loss_layer_019": 0.000277, "vq_loss_layer_020": 0.000349, "vq_loss_layer_021": 0.000896, "vq_loss_layer_022": 0.000341, "vq_loss_layer_023": 0.000542, "vq_loss_layer_024": 0.00041, "vq_loss_layer_025": 0.000561, "vq_loss_layer_026": 0.000751, "vq_loss_layer_027": 0.000793, "vq_loss_layer_028": 0.001312, "vq_loss_layer_029": 0.0019, "vq_loss_layer_030": 0.002762, "vq_loss_layer_031": 0.008972 }, { "ce_loss": 2.317368, "epoch": 0.00205, "grad_norm": 0.006534356623888016, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.060681, "kv_vq_loss": 0.000806, "learning_rate": 0.0008279384652639384, "loss": 0.061499, "step": 2050, "value_mse_loss_layer_000": 0.000908, "value_mse_loss_layer_001": 0.002518, "value_mse_loss_layer_002": 0.010315, "value_mse_loss_layer_003": 0.018311, "value_mse_loss_layer_004": 0.014282, "value_mse_loss_layer_005": 0.014404, "value_mse_loss_layer_006": 0.017334, "value_mse_loss_layer_007": 0.021118, "value_mse_loss_layer_008": 0.023926, "value_mse_loss_layer_009": 0.029907, "value_mse_loss_layer_010": 0.025269, "value_mse_loss_layer_011": 0.028076, "value_mse_loss_layer_012": 0.028809, "value_mse_loss_layer_013": 0.03064, "value_mse_loss_layer_014": 0.031982, "value_mse_loss_layer_015": 0.032715, "value_mse_loss_layer_016": 0.029297, "value_mse_loss_layer_017": 0.033936, "value_mse_loss_layer_018": 0.034668, "value_mse_loss_layer_019": 0.037109, "value_mse_loss_layer_020": 0.038574, "value_mse_loss_layer_021": 0.048096, "value_mse_loss_layer_022": 0.043945, "value_mse_loss_layer_023": 0.052002, "value_mse_loss_layer_024": 0.05835, "value_mse_loss_layer_025": 0.071289, "value_mse_loss_layer_026": 0.054443, "value_mse_loss_layer_027": 0.077637, "value_mse_loss_layer_028": 0.079102, "value_mse_loss_layer_029": 0.118652, "value_mse_loss_layer_030": 0.107422, "value_mse_loss_layer_031": 0.129883, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 9.3e-05, "vq_loss_layer_005": 0.000104, "vq_loss_layer_006": 0.000178, "vq_loss_layer_007": 0.000322, "vq_loss_layer_008": 0.000311, "vq_loss_layer_009": 0.000404, "vq_loss_layer_010": 0.000284, "vq_loss_layer_011": 0.000389, "vq_loss_layer_012": 0.00061, "vq_loss_layer_013": 0.000622, "vq_loss_layer_014": 0.000629, "vq_loss_layer_015": 0.000591, "vq_loss_layer_016": 0.000618, "vq_loss_layer_017": 0.000576, "vq_loss_layer_018": 0.000372, "vq_loss_layer_019": 0.000261, "vq_loss_layer_020": 0.000292, "vq_loss_layer_021": 0.000633, "vq_loss_layer_022": 0.000315, "vq_loss_layer_023": 0.000399, "vq_loss_layer_024": 0.000429, "vq_loss_layer_025": 0.00042, "vq_loss_layer_026": 0.000568, "vq_loss_layer_027": 0.000744, "vq_loss_layer_028": 0.001045, "vq_loss_layer_029": 0.001839, "vq_loss_layer_030": 0.002655, "vq_loss_layer_031": 0.00708 }, { "ce_loss": 2.285838, "epoch": 0.00206, "grad_norm": 0.005149966571480036, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.076172, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.060489, "kv_vq_loss": 0.000801, "learning_rate": 0.0008284668050922883, "loss": 0.061279, "step": 2060, "value_mse_loss_layer_000": 0.000923, "value_mse_loss_layer_001": 0.002594, "value_mse_loss_layer_002": 0.010071, "value_mse_loss_layer_003": 0.015869, "value_mse_loss_layer_004": 0.015259, "value_mse_loss_layer_005": 0.014893, "value_mse_loss_layer_006": 0.01709, "value_mse_loss_layer_007": 0.019409, "value_mse_loss_layer_008": 0.022339, "value_mse_loss_layer_009": 0.030884, "value_mse_loss_layer_010": 0.026733, "value_mse_loss_layer_011": 0.026611, "value_mse_loss_layer_012": 0.027832, "value_mse_loss_layer_013": 0.029053, "value_mse_loss_layer_014": 0.031494, "value_mse_loss_layer_015": 0.032471, "value_mse_loss_layer_016": 0.027954, "value_mse_loss_layer_017": 0.032227, "value_mse_loss_layer_018": 0.030151, "value_mse_loss_layer_019": 0.035645, "value_mse_loss_layer_020": 0.036621, "value_mse_loss_layer_021": 0.043701, "value_mse_loss_layer_022": 0.042236, "value_mse_loss_layer_023": 0.047363, "value_mse_loss_layer_024": 0.052246, "value_mse_loss_layer_025": 0.064941, "value_mse_loss_layer_026": 0.075195, "value_mse_loss_layer_027": 0.072266, "value_mse_loss_layer_028": 0.071289, "value_mse_loss_layer_029": 0.115234, "value_mse_loss_layer_030": 0.097168, "value_mse_loss_layer_031": 0.123047, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 4.3e-05, "vq_loss_layer_004": 0.000123, "vq_loss_layer_005": 0.000122, "vq_loss_layer_006": 0.000186, "vq_loss_layer_007": 0.000269, "vq_loss_layer_008": 0.000275, "vq_loss_layer_009": 0.000399, "vq_loss_layer_010": 0.000357, "vq_loss_layer_011": 0.000357, "vq_loss_layer_012": 0.000607, "vq_loss_layer_013": 0.000469, "vq_loss_layer_014": 0.00066, "vq_loss_layer_015": 0.000587, "vq_loss_layer_016": 0.00061, "vq_loss_layer_017": 0.00053, "vq_loss_layer_018": 0.000286, "vq_loss_layer_019": 0.000271, "vq_loss_layer_020": 0.00029, "vq_loss_layer_021": 0.00066, "vq_loss_layer_022": 0.00036, "vq_loss_layer_023": 0.000425, "vq_loss_layer_024": 0.000401, "vq_loss_layer_025": 0.000546, "vq_loss_layer_026": 0.001602, "vq_loss_layer_027": 0.000874, "vq_loss_layer_028": 0.001274, "vq_loss_layer_029": 0.001968, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.008606 }, { "ce_loss": 2.262446, "epoch": 0.00207, "grad_norm": 0.004236327949911356, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.060291, "kv_vq_loss": 0.000805, "learning_rate": 0.0008289925863642294, "loss": 0.061096, "step": 2070, "value_mse_loss_layer_000": 0.000938, "value_mse_loss_layer_001": 0.002563, "value_mse_loss_layer_002": 0.01123, "value_mse_loss_layer_003": 0.015991, "value_mse_loss_layer_004": 0.014832, "value_mse_loss_layer_005": 0.014893, "value_mse_loss_layer_006": 0.018311, "value_mse_loss_layer_007": 0.019897, "value_mse_loss_layer_008": 0.024048, "value_mse_loss_layer_009": 0.030151, "value_mse_loss_layer_010": 0.026733, "value_mse_loss_layer_011": 0.027954, "value_mse_loss_layer_012": 0.029785, "value_mse_loss_layer_013": 0.030029, "value_mse_loss_layer_014": 0.031494, "value_mse_loss_layer_015": 0.03418, "value_mse_loss_layer_016": 0.029785, "value_mse_loss_layer_017": 0.033203, "value_mse_loss_layer_018": 0.030762, "value_mse_loss_layer_019": 0.036865, "value_mse_loss_layer_020": 0.039795, "value_mse_loss_layer_021": 0.04541, "value_mse_loss_layer_022": 0.042725, "value_mse_loss_layer_023": 0.051514, "value_mse_loss_layer_024": 0.051514, "value_mse_loss_layer_025": 0.06543, "value_mse_loss_layer_026": 0.058105, "value_mse_loss_layer_027": 0.068848, "value_mse_loss_layer_028": 0.071289, "value_mse_loss_layer_029": 0.102051, "value_mse_loss_layer_030": 0.094238, "value_mse_loss_layer_031": 0.126953, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 9.6e-05, "vq_loss_layer_005": 0.000111, "vq_loss_layer_006": 0.000212, "vq_loss_layer_007": 0.000271, "vq_loss_layer_008": 0.000336, "vq_loss_layer_009": 0.00033, "vq_loss_layer_010": 0.000332, "vq_loss_layer_011": 0.000341, "vq_loss_layer_012": 0.00066, "vq_loss_layer_013": 0.000446, "vq_loss_layer_014": 0.000553, "vq_loss_layer_015": 0.000771, "vq_loss_layer_016": 0.000572, "vq_loss_layer_017": 0.000484, "vq_loss_layer_018": 0.000278, "vq_loss_layer_019": 0.000271, "vq_loss_layer_020": 0.000332, "vq_loss_layer_021": 0.00058, "vq_loss_layer_022": 0.000313, "vq_loss_layer_023": 0.000444, "vq_loss_layer_024": 0.00032, "vq_loss_layer_025": 0.000414, "vq_loss_layer_026": 0.00087, "vq_loss_layer_027": 0.000645, "vq_loss_layer_028": 0.000931, "vq_loss_layer_029": 0.001503, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.007599 }, { "ce_loss": 2.302996, "epoch": 0.00208, "grad_norm": 0.0057539367116987705, "key_mse_loss_layer_000": 0.002563, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.043701, "key_mse_loss_layer_004": 0.04126, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.123535, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.102051, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.059326, "kv_mse_loss": 0.059973, "kv_vq_loss": 0.000826, "learning_rate": 0.0008295158337406903, "loss": 0.060809, "step": 2080, "value_mse_loss_layer_000": 0.000916, "value_mse_loss_layer_001": 0.002502, "value_mse_loss_layer_002": 0.010376, "value_mse_loss_layer_003": 0.016602, "value_mse_loss_layer_004": 0.014832, "value_mse_loss_layer_005": 0.0177, "value_mse_loss_layer_006": 0.017578, "value_mse_loss_layer_007": 0.02002, "value_mse_loss_layer_008": 0.022949, "value_mse_loss_layer_009": 0.032227, "value_mse_loss_layer_010": 0.026001, "value_mse_loss_layer_011": 0.027954, "value_mse_loss_layer_012": 0.02771, "value_mse_loss_layer_013": 0.029785, "value_mse_loss_layer_014": 0.032959, "value_mse_loss_layer_015": 0.032959, "value_mse_loss_layer_016": 0.027344, "value_mse_loss_layer_017": 0.032227, "value_mse_loss_layer_018": 0.030273, "value_mse_loss_layer_019": 0.034424, "value_mse_loss_layer_020": 0.036621, "value_mse_loss_layer_021": 0.058838, "value_mse_loss_layer_022": 0.038574, "value_mse_loss_layer_023": 0.043701, "value_mse_loss_layer_024": 0.051514, "value_mse_loss_layer_025": 0.065918, "value_mse_loss_layer_026": 0.052002, "value_mse_loss_layer_027": 0.065918, "value_mse_loss_layer_028": 0.07373, "value_mse_loss_layer_029": 0.094238, "value_mse_loss_layer_030": 0.088379, "value_mse_loss_layer_031": 0.118164, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 2.4e-05, "vq_loss_layer_002": 2.8e-05, "vq_loss_layer_003": 5.7e-05, "vq_loss_layer_004": 9.3e-05, "vq_loss_layer_005": 0.000205, "vq_loss_layer_006": 0.000207, "vq_loss_layer_007": 0.000296, "vq_loss_layer_008": 0.000357, "vq_loss_layer_009": 0.000542, "vq_loss_layer_010": 0.000406, "vq_loss_layer_011": 0.000448, "vq_loss_layer_012": 0.000629, "vq_loss_layer_013": 0.000496, "vq_loss_layer_014": 0.000805, "vq_loss_layer_015": 0.000668, "vq_loss_layer_016": 0.000607, "vq_loss_layer_017": 0.000568, "vq_loss_layer_018": 0.000353, "vq_loss_layer_019": 0.000305, "vq_loss_layer_020": 0.000341, "vq_loss_layer_021": 0.001251, "vq_loss_layer_022": 0.000423, "vq_loss_layer_023": 0.000479, "vq_loss_layer_024": 0.000526, "vq_loss_layer_025": 0.000824, "vq_loss_layer_026": 0.000874, "vq_loss_layer_027": 0.000992, "vq_loss_layer_028": 0.001656, "vq_loss_layer_029": 0.001945, "vq_loss_layer_030": 0.003036, "vq_loss_layer_031": 0.009216 }, { "ce_loss": 2.244032, "epoch": 0.00209, "grad_norm": 0.004911825060844421, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.060803, "kv_vq_loss": 0.000818, "learning_rate": 0.0008300365715277633, "loss": 0.061624, "step": 2090, "value_mse_loss_layer_000": 0.000919, "value_mse_loss_layer_001": 0.002609, "value_mse_loss_layer_002": 0.01062, "value_mse_loss_layer_003": 0.015747, "value_mse_loss_layer_004": 0.014587, "value_mse_loss_layer_005": 0.015503, "value_mse_loss_layer_006": 0.017456, "value_mse_loss_layer_007": 0.019409, "value_mse_loss_layer_008": 0.023315, "value_mse_loss_layer_009": 0.03125, "value_mse_loss_layer_010": 0.0271, "value_mse_loss_layer_011": 0.02771, "value_mse_loss_layer_012": 0.02832, "value_mse_loss_layer_013": 0.029419, "value_mse_loss_layer_014": 0.031494, "value_mse_loss_layer_015": 0.033691, "value_mse_loss_layer_016": 0.030518, "value_mse_loss_layer_017": 0.032959, "value_mse_loss_layer_018": 0.032227, "value_mse_loss_layer_019": 0.043945, "value_mse_loss_layer_020": 0.039551, "value_mse_loss_layer_021": 0.053711, "value_mse_loss_layer_022": 0.043457, "value_mse_loss_layer_023": 0.048096, "value_mse_loss_layer_024": 0.052002, "value_mse_loss_layer_025": 0.064941, "value_mse_loss_layer_026": 0.054932, "value_mse_loss_layer_027": 0.072754, "value_mse_loss_layer_028": 0.072754, "value_mse_loss_layer_029": 0.112793, "value_mse_loss_layer_030": 0.095703, "value_mse_loss_layer_031": 0.118164, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 0.000103, "vq_loss_layer_005": 0.000135, "vq_loss_layer_006": 0.000187, "vq_loss_layer_007": 0.000286, "vq_loss_layer_008": 0.00028, "vq_loss_layer_009": 0.000433, "vq_loss_layer_010": 0.000334, "vq_loss_layer_011": 0.000393, "vq_loss_layer_012": 0.000603, "vq_loss_layer_013": 0.000496, "vq_loss_layer_014": 0.000553, "vq_loss_layer_015": 0.000591, "vq_loss_layer_016": 0.000599, "vq_loss_layer_017": 0.000523, "vq_loss_layer_018": 0.000336, "vq_loss_layer_019": 0.000324, "vq_loss_layer_020": 0.00029, "vq_loss_layer_021": 0.000683, "vq_loss_layer_022": 0.000322, "vq_loss_layer_023": 0.00038, "vq_loss_layer_024": 0.000315, "vq_loss_layer_025": 0.000422, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000786, "vq_loss_layer_028": 0.000927, "vq_loss_layer_029": 0.001762, "vq_loss_layer_030": 0.002838, "vq_loss_layer_031": 0.006744 }, { "ce_loss": 2.289284, "epoch": 0.0021, "grad_norm": 0.005617267917841673, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.011047, "key_mse_loss_layer_002": 0.063965, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.064941, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.092773, "key_mse_loss_layer_009": 0.097168, "key_mse_loss_layer_010": 0.109863, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.078613, "key_mse_loss_layer_013": 0.133789, "key_mse_loss_layer_014": 0.129883, "key_mse_loss_layer_015": 0.117188, "key_mse_loss_layer_016": 0.114258, "key_mse_loss_layer_017": 0.109375, "key_mse_loss_layer_018": 0.123535, "key_mse_loss_layer_019": 0.099121, "key_mse_loss_layer_020": 0.112305, "key_mse_loss_layer_021": 0.10498, "key_mse_loss_layer_022": 0.112793, "key_mse_loss_layer_023": 0.116211, "key_mse_loss_layer_024": 0.094727, "key_mse_loss_layer_025": 0.089844, "key_mse_loss_layer_026": 0.105469, "key_mse_loss_layer_027": 0.111328, "key_mse_loss_layer_028": 0.114746, "key_mse_loss_layer_029": 0.106445, "key_mse_loss_layer_030": 0.115723, "key_mse_loss_layer_031": 0.086914, "kv_mse_loss": 0.060336, "kv_vq_loss": 0.000837, "learning_rate": 0.0008305548236834797, "loss": 0.061163, "step": 2100, "value_mse_loss_layer_000": 0.000889, "value_mse_loss_layer_001": 0.002548, "value_mse_loss_layer_002": 0.010315, "value_mse_loss_layer_003": 0.016235, "value_mse_loss_layer_004": 0.015198, "value_mse_loss_layer_005": 0.014893, "value_mse_loss_layer_006": 0.0177, "value_mse_loss_layer_007": 0.020996, "value_mse_loss_layer_008": 0.024048, "value_mse_loss_layer_009": 0.027344, "value_mse_loss_layer_010": 0.023804, "value_mse_loss_layer_011": 0.025757, "value_mse_loss_layer_012": 0.027588, "value_mse_loss_layer_013": 0.028442, "value_mse_loss_layer_014": 0.030029, "value_mse_loss_layer_015": 0.026733, "value_mse_loss_layer_016": 0.02478, "value_mse_loss_layer_017": 0.029053, "value_mse_loss_layer_018": 0.033203, "value_mse_loss_layer_019": 0.035645, "value_mse_loss_layer_020": 0.041992, "value_mse_loss_layer_021": 0.044189, "value_mse_loss_layer_022": 0.041016, "value_mse_loss_layer_023": 0.068359, "value_mse_loss_layer_024": 0.054932, "value_mse_loss_layer_025": 0.069336, "value_mse_loss_layer_026": 0.069824, "value_mse_loss_layer_027": 0.081055, "value_mse_loss_layer_028": 0.078125, "value_mse_loss_layer_029": 0.123047, "value_mse_loss_layer_030": 0.113281, "value_mse_loss_layer_031": 0.154297, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 2.8e-05, "vq_loss_layer_002": 4e-05, "vq_loss_layer_003": 5.1e-05, "vq_loss_layer_004": 0.000111, "vq_loss_layer_005": 0.000134, "vq_loss_layer_006": 0.000246, "vq_loss_layer_007": 0.000381, "vq_loss_layer_008": 0.000481, "vq_loss_layer_009": 0.000366, "vq_loss_layer_010": 0.000376, "vq_loss_layer_011": 0.000372, "vq_loss_layer_012": 0.000706, "vq_loss_layer_013": 0.00053, "vq_loss_layer_014": 0.000637, "vq_loss_layer_015": 0.000492, "vq_loss_layer_016": 0.000534, "vq_loss_layer_017": 0.000515, "vq_loss_layer_018": 0.00069, "vq_loss_layer_019": 0.00036, "vq_loss_layer_020": 0.000341, "vq_loss_layer_021": 0.000526, "vq_loss_layer_022": 0.000402, "vq_loss_layer_023": 0.000679, "vq_loss_layer_024": 0.000374, "vq_loss_layer_025": 0.000679, "vq_loss_layer_026": 0.001549, "vq_loss_layer_027": 0.000919, "vq_loss_layer_028": 0.001541, "vq_loss_layer_029": 0.003876, "vq_loss_layer_030": 0.003845, "vq_loss_layer_031": 0.013977 }, { "ce_loss": 2.301955, "epoch": 0.00211, "grad_norm": 0.0043686311691999435, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.060126, "kv_vq_loss": 0.000786, "learning_rate": 0.0008310706138244231, "loss": 0.060907, "step": 2110, "value_mse_loss_layer_000": 0.000912, "value_mse_loss_layer_001": 0.002533, "value_mse_loss_layer_002": 0.010071, "value_mse_loss_layer_003": 0.015991, "value_mse_loss_layer_004": 0.014038, "value_mse_loss_layer_005": 0.014404, "value_mse_loss_layer_006": 0.0177, "value_mse_loss_layer_007": 0.019653, "value_mse_loss_layer_008": 0.022827, "value_mse_loss_layer_009": 0.029541, "value_mse_loss_layer_010": 0.026245, "value_mse_loss_layer_011": 0.028931, "value_mse_loss_layer_012": 0.029785, "value_mse_loss_layer_013": 0.030273, "value_mse_loss_layer_014": 0.031738, "value_mse_loss_layer_015": 0.033936, "value_mse_loss_layer_016": 0.030273, "value_mse_loss_layer_017": 0.032715, "value_mse_loss_layer_018": 0.03125, "value_mse_loss_layer_019": 0.036621, "value_mse_loss_layer_020": 0.040283, "value_mse_loss_layer_021": 0.046143, "value_mse_loss_layer_022": 0.044189, "value_mse_loss_layer_023": 0.049805, "value_mse_loss_layer_024": 0.056396, "value_mse_loss_layer_025": 0.069824, "value_mse_loss_layer_026": 0.056152, "value_mse_loss_layer_027": 0.074707, "value_mse_loss_layer_028": 0.072754, "value_mse_loss_layer_029": 0.117188, "value_mse_loss_layer_030": 0.097168, "value_mse_loss_layer_031": 0.121582, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 7.7e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000206, "vq_loss_layer_007": 0.000284, "vq_loss_layer_008": 0.000269, "vq_loss_layer_009": 0.00033, "vq_loss_layer_010": 0.000315, "vq_loss_layer_011": 0.000427, "vq_loss_layer_012": 0.00069, "vq_loss_layer_013": 0.000492, "vq_loss_layer_014": 0.000553, "vq_loss_layer_015": 0.000618, "vq_loss_layer_016": 0.000633, "vq_loss_layer_017": 0.000486, "vq_loss_layer_018": 0.000303, "vq_loss_layer_019": 0.000235, "vq_loss_layer_020": 0.000355, "vq_loss_layer_021": 0.00053, "vq_loss_layer_022": 0.000334, "vq_loss_layer_023": 0.00036, "vq_loss_layer_024": 0.000412, "vq_loss_layer_025": 0.000439, "vq_loss_layer_026": 0.000664, "vq_loss_layer_027": 0.000725, "vq_loss_layer_028": 0.000912, "vq_loss_layer_029": 0.001984, "vq_loss_layer_030": 0.002625, "vq_loss_layer_031": 0.007111 }, { "ce_loss": 2.339293, "epoch": 0.00212, "grad_norm": 0.004871021490544081, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.059961, "kv_vq_loss": 0.000784, "learning_rate": 0.0008315839652321877, "loss": 0.060751, "step": 2120, "value_mse_loss_layer_000": 0.0009, "value_mse_loss_layer_001": 0.002533, "value_mse_loss_layer_002": 0.01001, "value_mse_loss_layer_003": 0.015442, "value_mse_loss_layer_004": 0.014404, "value_mse_loss_layer_005": 0.014221, "value_mse_loss_layer_006": 0.017334, "value_mse_loss_layer_007": 0.020142, "value_mse_loss_layer_008": 0.023071, "value_mse_loss_layer_009": 0.03064, "value_mse_loss_layer_010": 0.025879, "value_mse_loss_layer_011": 0.028198, "value_mse_loss_layer_012": 0.026978, "value_mse_loss_layer_013": 0.029053, "value_mse_loss_layer_014": 0.030884, "value_mse_loss_layer_015": 0.033203, "value_mse_loss_layer_016": 0.028931, "value_mse_loss_layer_017": 0.032715, "value_mse_loss_layer_018": 0.033691, "value_mse_loss_layer_019": 0.04126, "value_mse_loss_layer_020": 0.039062, "value_mse_loss_layer_021": 0.04834, "value_mse_loss_layer_022": 0.041992, "value_mse_loss_layer_023": 0.048828, "value_mse_loss_layer_024": 0.057617, "value_mse_loss_layer_025": 0.065918, "value_mse_loss_layer_026": 0.052979, "value_mse_loss_layer_027": 0.069824, "value_mse_loss_layer_028": 0.068848, "value_mse_loss_layer_029": 0.106445, "value_mse_loss_layer_030": 0.097168, "value_mse_loss_layer_031": 0.119629, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 9.8e-05, "vq_loss_layer_005": 0.000104, "vq_loss_layer_006": 0.000197, "vq_loss_layer_007": 0.000338, "vq_loss_layer_008": 0.000301, "vq_loss_layer_009": 0.000418, "vq_loss_layer_010": 0.000292, "vq_loss_layer_011": 0.000355, "vq_loss_layer_012": 0.000553, "vq_loss_layer_013": 0.000416, "vq_loss_layer_014": 0.000546, "vq_loss_layer_015": 0.000656, "vq_loss_layer_016": 0.000561, "vq_loss_layer_017": 0.000542, "vq_loss_layer_018": 0.000353, "vq_loss_layer_019": 0.000244, "vq_loss_layer_020": 0.00033, "vq_loss_layer_021": 0.000603, "vq_loss_layer_022": 0.000322, "vq_loss_layer_023": 0.000389, "vq_loss_layer_024": 0.000406, "vq_loss_layer_025": 0.000444, "vq_loss_layer_026": 0.000641, "vq_loss_layer_027": 0.000759, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.001648, "vq_loss_layer_030": 0.002548, "vq_loss_layer_031": 0.006897 }, { "ce_loss": 2.318256, "epoch": 0.00213, "grad_norm": 0.007671797182410955, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.060791, "kv_mse_loss": 0.060495, "kv_vq_loss": 0.000827, "learning_rate": 0.0008320949008596843, "loss": 0.061313, "step": 2130, "value_mse_loss_layer_000": 0.000942, "value_mse_loss_layer_001": 0.002533, "value_mse_loss_layer_002": 0.010498, "value_mse_loss_layer_003": 0.016357, "value_mse_loss_layer_004": 0.016113, "value_mse_loss_layer_005": 0.015991, "value_mse_loss_layer_006": 0.017944, "value_mse_loss_layer_007": 0.020508, "value_mse_loss_layer_008": 0.023193, "value_mse_loss_layer_009": 0.030884, "value_mse_loss_layer_010": 0.027222, "value_mse_loss_layer_011": 0.030151, "value_mse_loss_layer_012": 0.030029, "value_mse_loss_layer_013": 0.031982, "value_mse_loss_layer_014": 0.033203, "value_mse_loss_layer_015": 0.033691, "value_mse_loss_layer_016": 0.029175, "value_mse_loss_layer_017": 0.03418, "value_mse_loss_layer_018": 0.03064, "value_mse_loss_layer_019": 0.035645, "value_mse_loss_layer_020": 0.046143, "value_mse_loss_layer_021": 0.044922, "value_mse_loss_layer_022": 0.045654, "value_mse_loss_layer_023": 0.046143, "value_mse_loss_layer_024": 0.05127, "value_mse_loss_layer_025": 0.067383, "value_mse_loss_layer_026": 0.061523, "value_mse_loss_layer_027": 0.071289, "value_mse_loss_layer_028": 0.070312, "value_mse_loss_layer_029": 0.12793, "value_mse_loss_layer_030": 0.091797, "value_mse_loss_layer_031": 0.124512, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 2.2e-05, "vq_loss_layer_002": 2.6e-05, "vq_loss_layer_003": 5.4e-05, "vq_loss_layer_004": 0.000119, "vq_loss_layer_005": 0.000136, "vq_loss_layer_006": 0.000219, "vq_loss_layer_007": 0.000296, "vq_loss_layer_008": 0.000322, "vq_loss_layer_009": 0.000383, "vq_loss_layer_010": 0.000391, "vq_loss_layer_011": 0.000561, "vq_loss_layer_012": 0.000713, "vq_loss_layer_013": 0.000595, "vq_loss_layer_014": 0.000744, "vq_loss_layer_015": 0.00071, "vq_loss_layer_016": 0.000645, "vq_loss_layer_017": 0.000622, "vq_loss_layer_018": 0.000343, "vq_loss_layer_019": 0.000359, "vq_loss_layer_020": 0.000454, "vq_loss_layer_021": 0.000793, "vq_loss_layer_022": 0.000595, "vq_loss_layer_023": 0.000492, "vq_loss_layer_024": 0.000486, "vq_loss_layer_025": 0.000809, "vq_loss_layer_026": 0.001381, "vq_loss_layer_027": 0.001137, "vq_loss_layer_028": 0.00145, "vq_loss_layer_029": 0.003052, "vq_loss_layer_030": 0.00322, "vq_loss_layer_031": 0.009888 }, { "ce_loss": 2.268691, "epoch": 0.00214, "grad_norm": 0.005766888149082661, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.06055, "kv_vq_loss": 0.00082, "learning_rate": 0.0008326034433372975, "loss": 0.061362, "step": 2140, "value_mse_loss_layer_000": 0.000923, "value_mse_loss_layer_001": 0.002533, "value_mse_loss_layer_002": 0.01001, "value_mse_loss_layer_003": 0.015747, "value_mse_loss_layer_004": 0.013916, "value_mse_loss_layer_005": 0.014282, "value_mse_loss_layer_006": 0.01709, "value_mse_loss_layer_007": 0.019165, "value_mse_loss_layer_008": 0.025146, "value_mse_loss_layer_009": 0.031494, "value_mse_loss_layer_010": 0.026001, "value_mse_loss_layer_011": 0.027954, "value_mse_loss_layer_012": 0.027588, "value_mse_loss_layer_013": 0.030029, "value_mse_loss_layer_014": 0.031128, "value_mse_loss_layer_015": 0.034424, "value_mse_loss_layer_016": 0.03064, "value_mse_loss_layer_017": 0.032471, "value_mse_loss_layer_018": 0.039795, "value_mse_loss_layer_019": 0.036377, "value_mse_loss_layer_020": 0.039062, "value_mse_loss_layer_021": 0.048828, "value_mse_loss_layer_022": 0.043701, "value_mse_loss_layer_023": 0.078613, "value_mse_loss_layer_024": 0.051025, "value_mse_loss_layer_025": 0.063965, "value_mse_loss_layer_026": 0.051514, "value_mse_loss_layer_027": 0.068848, "value_mse_loss_layer_028": 0.070312, "value_mse_loss_layer_029": 0.105957, "value_mse_loss_layer_030": 0.091797, "value_mse_loss_layer_031": 0.120605, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 0.000112, "vq_loss_layer_006": 0.000195, "vq_loss_layer_007": 0.000292, "vq_loss_layer_008": 0.000347, "vq_loss_layer_009": 0.00045, "vq_loss_layer_010": 0.000288, "vq_loss_layer_011": 0.000349, "vq_loss_layer_012": 0.000568, "vq_loss_layer_013": 0.000481, "vq_loss_layer_014": 0.000546, "vq_loss_layer_015": 0.00066, "vq_loss_layer_016": 0.000633, "vq_loss_layer_017": 0.000483, "vq_loss_layer_018": 0.000418, "vq_loss_layer_019": 0.000256, "vq_loss_layer_020": 0.000345, "vq_loss_layer_021": 0.000664, "vq_loss_layer_022": 0.000336, "vq_loss_layer_023": 0.000896, "vq_loss_layer_024": 0.000315, "vq_loss_layer_025": 0.000399, "vq_loss_layer_026": 0.000587, "vq_loss_layer_027": 0.00069, "vq_loss_layer_028": 0.000866, "vq_loss_layer_029": 0.001572, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.007019 }, { "ce_loss": 2.259659, "epoch": 0.00215, "grad_norm": 0.005097293760627508, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.060214, "kv_vq_loss": 0.000794, "learning_rate": 0.0008331096149789012, "loss": 0.061008, "step": 2150, "value_mse_loss_layer_000": 0.000874, "value_mse_loss_layer_001": 0.002487, "value_mse_loss_layer_002": 0.010376, "value_mse_loss_layer_003": 0.02063, "value_mse_loss_layer_004": 0.015076, "value_mse_loss_layer_005": 0.015198, "value_mse_loss_layer_006": 0.017456, "value_mse_loss_layer_007": 0.019043, "value_mse_loss_layer_008": 0.023315, "value_mse_loss_layer_009": 0.029785, "value_mse_loss_layer_010": 0.026978, "value_mse_loss_layer_011": 0.0271, "value_mse_loss_layer_012": 0.028931, "value_mse_loss_layer_013": 0.029419, "value_mse_loss_layer_014": 0.032227, "value_mse_loss_layer_015": 0.032959, "value_mse_loss_layer_016": 0.028931, "value_mse_loss_layer_017": 0.033447, "value_mse_loss_layer_018": 0.033691, "value_mse_loss_layer_019": 0.036865, "value_mse_loss_layer_020": 0.036621, "value_mse_loss_layer_021": 0.052002, "value_mse_loss_layer_022": 0.043213, "value_mse_loss_layer_023": 0.05249, "value_mse_loss_layer_024": 0.055176, "value_mse_loss_layer_025": 0.075195, "value_mse_loss_layer_026": 0.056152, "value_mse_loss_layer_027": 0.075684, "value_mse_loss_layer_028": 0.074219, "value_mse_loss_layer_029": 0.10791, "value_mse_loss_layer_030": 0.097656, "value_mse_loss_layer_031": 0.124023, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 5.6e-05, "vq_loss_layer_004": 0.000117, "vq_loss_layer_005": 0.000117, "vq_loss_layer_006": 0.000196, "vq_loss_layer_007": 0.000284, "vq_loss_layer_008": 0.000284, "vq_loss_layer_009": 0.000362, "vq_loss_layer_010": 0.00033, "vq_loss_layer_011": 0.000324, "vq_loss_layer_012": 0.000652, "vq_loss_layer_013": 0.000458, "vq_loss_layer_014": 0.000565, "vq_loss_layer_015": 0.000618, "vq_loss_layer_016": 0.000572, "vq_loss_layer_017": 0.000664, "vq_loss_layer_018": 0.000372, "vq_loss_layer_019": 0.000256, "vq_loss_layer_020": 0.00025, "vq_loss_layer_021": 0.000679, "vq_loss_layer_022": 0.000332, "vq_loss_layer_023": 0.000391, "vq_loss_layer_024": 0.00036, "vq_loss_layer_025": 0.000515, "vq_loss_layer_026": 0.000671, "vq_loss_layer_027": 0.000751, "vq_loss_layer_028": 0.00116, "vq_loss_layer_029": 0.002228, "vq_loss_layer_030": 0.003036, "vq_loss_layer_031": 0.007874 }, { "ce_loss": 2.326384, "epoch": 0.00216, "grad_norm": 0.0062781148590147495, "key_mse_loss_layer_000": 0.002686, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.059821, "kv_vq_loss": 0.000798, "learning_rate": 0.0008336134377877326, "loss": 0.060617, "step": 2160, "value_mse_loss_layer_000": 0.000904, "value_mse_loss_layer_001": 0.002472, "value_mse_loss_layer_002": 0.010864, "value_mse_loss_layer_003": 0.017334, "value_mse_loss_layer_004": 0.016113, "value_mse_loss_layer_005": 0.015503, "value_mse_loss_layer_006": 0.017822, "value_mse_loss_layer_007": 0.020264, "value_mse_loss_layer_008": 0.023193, "value_mse_loss_layer_009": 0.030518, "value_mse_loss_layer_010": 0.026855, "value_mse_loss_layer_011": 0.029541, "value_mse_loss_layer_012": 0.028809, "value_mse_loss_layer_013": 0.03125, "value_mse_loss_layer_014": 0.033447, "value_mse_loss_layer_015": 0.031738, "value_mse_loss_layer_016": 0.029175, "value_mse_loss_layer_017": 0.031982, "value_mse_loss_layer_018": 0.030884, "value_mse_loss_layer_019": 0.036865, "value_mse_loss_layer_020": 0.037598, "value_mse_loss_layer_021": 0.04541, "value_mse_loss_layer_022": 0.042236, "value_mse_loss_layer_023": 0.05249, "value_mse_loss_layer_024": 0.056641, "value_mse_loss_layer_025": 0.066406, "value_mse_loss_layer_026": 0.05542, "value_mse_loss_layer_027": 0.07373, "value_mse_loss_layer_028": 0.075195, "value_mse_loss_layer_029": 0.11084, "value_mse_loss_layer_030": 0.109375, "value_mse_loss_layer_031": 0.136719, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 3e-05, "vq_loss_layer_003": 5.7e-05, "vq_loss_layer_004": 0.00015, "vq_loss_layer_005": 0.000138, "vq_loss_layer_006": 0.000221, "vq_loss_layer_007": 0.000324, "vq_loss_layer_008": 0.000338, "vq_loss_layer_009": 0.000431, "vq_loss_layer_010": 0.000395, "vq_loss_layer_011": 0.000479, "vq_loss_layer_012": 0.000683, "vq_loss_layer_013": 0.000591, "vq_loss_layer_014": 0.000729, "vq_loss_layer_015": 0.00069, "vq_loss_layer_016": 0.000702, "vq_loss_layer_017": 0.000549, "vq_loss_layer_018": 0.00079, "vq_loss_layer_019": 0.000334, "vq_loss_layer_020": 0.000402, "vq_loss_layer_021": 0.000668, "vq_loss_layer_022": 0.000488, "vq_loss_layer_023": 0.000568, "vq_loss_layer_024": 0.000603, "vq_loss_layer_025": 0.000679, "vq_loss_layer_026": 0.000778, "vq_loss_layer_027": 0.000984, "vq_loss_layer_028": 0.001305, "vq_loss_layer_029": 0.002243, "vq_loss_layer_030": 0.003799, "vq_loss_layer_031": 0.010254 }, { "ce_loss": 2.311411, "epoch": 0.00217, "grad_norm": 0.004327879752963781, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.059708, "kv_vq_loss": 0.000789, "learning_rate": 0.0008341149334621323, "loss": 0.060489, "step": 2170, "value_mse_loss_layer_000": 0.000919, "value_mse_loss_layer_001": 0.002518, "value_mse_loss_layer_002": 0.009888, "value_mse_loss_layer_003": 0.015869, "value_mse_loss_layer_004": 0.014404, "value_mse_loss_layer_005": 0.014587, "value_mse_loss_layer_006": 0.017334, "value_mse_loss_layer_007": 0.019165, "value_mse_loss_layer_008": 0.023438, "value_mse_loss_layer_009": 0.030151, "value_mse_loss_layer_010": 0.026123, "value_mse_loss_layer_011": 0.027832, "value_mse_loss_layer_012": 0.029297, "value_mse_loss_layer_013": 0.029541, "value_mse_loss_layer_014": 0.03125, "value_mse_loss_layer_015": 0.035645, "value_mse_loss_layer_016": 0.029297, "value_mse_loss_layer_017": 0.033936, "value_mse_loss_layer_018": 0.029663, "value_mse_loss_layer_019": 0.034668, "value_mse_loss_layer_020": 0.038086, "value_mse_loss_layer_021": 0.046875, "value_mse_loss_layer_022": 0.043457, "value_mse_loss_layer_023": 0.046631, "value_mse_loss_layer_024": 0.052002, "value_mse_loss_layer_025": 0.062256, "value_mse_loss_layer_026": 0.054443, "value_mse_loss_layer_027": 0.068848, "value_mse_loss_layer_028": 0.072266, "value_mse_loss_layer_029": 0.10498, "value_mse_loss_layer_030": 0.093262, "value_mse_loss_layer_031": 0.115723, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 0.000108, "vq_loss_layer_005": 0.000114, "vq_loss_layer_006": 0.000192, "vq_loss_layer_007": 0.000269, "vq_loss_layer_008": 0.000309, "vq_loss_layer_009": 0.000406, "vq_loss_layer_010": 0.00034, "vq_loss_layer_011": 0.000364, "vq_loss_layer_012": 0.000664, "vq_loss_layer_013": 0.000483, "vq_loss_layer_014": 0.000576, "vq_loss_layer_015": 0.000729, "vq_loss_layer_016": 0.000595, "vq_loss_layer_017": 0.000587, "vq_loss_layer_018": 0.000282, "vq_loss_layer_019": 0.000248, "vq_loss_layer_020": 0.000355, "vq_loss_layer_021": 0.000687, "vq_loss_layer_022": 0.000397, "vq_loss_layer_023": 0.000431, "vq_loss_layer_024": 0.000374, "vq_loss_layer_025": 0.000469, "vq_loss_layer_026": 0.000763, "vq_loss_layer_027": 0.00069, "vq_loss_layer_028": 0.001045, "vq_loss_layer_029": 0.001503, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.006714 }, { "ce_loss": 2.250386, "epoch": 0.00218, "grad_norm": 0.004225027281790972, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.059558, "kv_vq_loss": 0.000776, "learning_rate": 0.0008346141234011511, "loss": 0.060336, "step": 2180, "value_mse_loss_layer_000": 0.000923, "value_mse_loss_layer_001": 0.002548, "value_mse_loss_layer_002": 0.010376, "value_mse_loss_layer_003": 0.016113, "value_mse_loss_layer_004": 0.014893, "value_mse_loss_layer_005": 0.015076, "value_mse_loss_layer_006": 0.017944, "value_mse_loss_layer_007": 0.019531, "value_mse_loss_layer_008": 0.022583, "value_mse_loss_layer_009": 0.029175, "value_mse_loss_layer_010": 0.0271, "value_mse_loss_layer_011": 0.026855, "value_mse_loss_layer_012": 0.026978, "value_mse_loss_layer_013": 0.029053, "value_mse_loss_layer_014": 0.031738, "value_mse_loss_layer_015": 0.031982, "value_mse_loss_layer_016": 0.027832, "value_mse_loss_layer_017": 0.031738, "value_mse_loss_layer_018": 0.030518, "value_mse_loss_layer_019": 0.038086, "value_mse_loss_layer_020": 0.038086, "value_mse_loss_layer_021": 0.045898, "value_mse_loss_layer_022": 0.043213, "value_mse_loss_layer_023": 0.047363, "value_mse_loss_layer_024": 0.05249, "value_mse_loss_layer_025": 0.064453, "value_mse_loss_layer_026": 0.054199, "value_mse_loss_layer_027": 0.072266, "value_mse_loss_layer_028": 0.070801, "value_mse_loss_layer_029": 0.109863, "value_mse_loss_layer_030": 0.09668, "value_mse_loss_layer_031": 0.126953, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 0.000102, "vq_loss_layer_005": 0.000119, "vq_loss_layer_006": 0.00022, "vq_loss_layer_007": 0.000282, "vq_loss_layer_008": 0.00028, "vq_loss_layer_009": 0.000349, "vq_loss_layer_010": 0.000341, "vq_loss_layer_011": 0.000366, "vq_loss_layer_012": 0.000568, "vq_loss_layer_013": 0.000565, "vq_loss_layer_014": 0.000629, "vq_loss_layer_015": 0.000595, "vq_loss_layer_016": 0.000587, "vq_loss_layer_017": 0.000553, "vq_loss_layer_018": 0.000326, "vq_loss_layer_019": 0.000362, "vq_loss_layer_020": 0.000351, "vq_loss_layer_021": 0.000607, "vq_loss_layer_022": 0.00037, "vq_loss_layer_023": 0.000357, "vq_loss_layer_024": 0.000389, "vq_loss_layer_025": 0.000443, "vq_loss_layer_026": 0.000771, "vq_loss_layer_027": 0.000801, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.001984, "vq_loss_layer_030": 0.002945, "vq_loss_layer_031": 0.008179 }, { "ce_loss": 2.269796, "epoch": 0.00219, "grad_norm": 0.0053269267082214355, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.059781, "kv_vq_loss": 0.000771, "learning_rate": 0.0008351110287100295, "loss": 0.060568, "step": 2190, "value_mse_loss_layer_000": 0.000904, "value_mse_loss_layer_001": 0.002548, "value_mse_loss_layer_002": 0.009766, "value_mse_loss_layer_003": 0.016479, "value_mse_loss_layer_004": 0.014648, "value_mse_loss_layer_005": 0.014465, "value_mse_loss_layer_006": 0.017456, "value_mse_loss_layer_007": 0.019165, "value_mse_loss_layer_008": 0.023438, "value_mse_loss_layer_009": 0.029541, "value_mse_loss_layer_010": 0.025269, "value_mse_loss_layer_011": 0.026855, "value_mse_loss_layer_012": 0.028931, "value_mse_loss_layer_013": 0.028931, "value_mse_loss_layer_014": 0.030396, "value_mse_loss_layer_015": 0.033691, "value_mse_loss_layer_016": 0.029663, "value_mse_loss_layer_017": 0.031982, "value_mse_loss_layer_018": 0.029907, "value_mse_loss_layer_019": 0.036133, "value_mse_loss_layer_020": 0.037354, "value_mse_loss_layer_021": 0.045898, "value_mse_loss_layer_022": 0.044189, "value_mse_loss_layer_023": 0.051025, "value_mse_loss_layer_024": 0.056641, "value_mse_loss_layer_025": 0.076172, "value_mse_loss_layer_026": 0.054199, "value_mse_loss_layer_027": 0.07373, "value_mse_loss_layer_028": 0.077637, "value_mse_loss_layer_029": 0.109863, "value_mse_loss_layer_030": 0.098633, "value_mse_loss_layer_031": 0.121582, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 0.000105, "vq_loss_layer_005": 0.000112, "vq_loss_layer_006": 0.000187, "vq_loss_layer_007": 0.000267, "vq_loss_layer_008": 0.000286, "vq_loss_layer_009": 0.000362, "vq_loss_layer_010": 0.00029, "vq_loss_layer_011": 0.000332, "vq_loss_layer_012": 0.000668, "vq_loss_layer_013": 0.000519, "vq_loss_layer_014": 0.000561, "vq_loss_layer_015": 0.000595, "vq_loss_layer_016": 0.00061, "vq_loss_layer_017": 0.000463, "vq_loss_layer_018": 0.00028, "vq_loss_layer_019": 0.000248, "vq_loss_layer_020": 0.000269, "vq_loss_layer_021": 0.000546, "vq_loss_layer_022": 0.000338, "vq_loss_layer_023": 0.000391, "vq_loss_layer_024": 0.000374, "vq_loss_layer_025": 0.000479, "vq_loss_layer_026": 0.000584, "vq_loss_layer_027": 0.000725, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001846, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.006622 }, { "ce_loss": 2.338445, "epoch": 0.0022, "grad_norm": 0.0056713642552495, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.097168, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.094238, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.059631, "kv_vq_loss": 0.000797, "learning_rate": 0.0008356056702055514, "loss": 0.060419, "step": 2200, "value_mse_loss_layer_000": 0.000893, "value_mse_loss_layer_001": 0.002472, "value_mse_loss_layer_002": 0.011169, "value_mse_loss_layer_003": 0.016846, "value_mse_loss_layer_004": 0.014832, "value_mse_loss_layer_005": 0.015442, "value_mse_loss_layer_006": 0.016968, "value_mse_loss_layer_007": 0.019409, "value_mse_loss_layer_008": 0.022827, "value_mse_loss_layer_009": 0.029785, "value_mse_loss_layer_010": 0.025391, "value_mse_loss_layer_011": 0.027222, "value_mse_loss_layer_012": 0.02832, "value_mse_loss_layer_013": 0.030151, "value_mse_loss_layer_014": 0.031738, "value_mse_loss_layer_015": 0.032959, "value_mse_loss_layer_016": 0.028442, "value_mse_loss_layer_017": 0.032715, "value_mse_loss_layer_018": 0.037842, "value_mse_loss_layer_019": 0.036377, "value_mse_loss_layer_020": 0.038818, "value_mse_loss_layer_021": 0.051025, "value_mse_loss_layer_022": 0.045166, "value_mse_loss_layer_023": 0.056641, "value_mse_loss_layer_024": 0.056641, "value_mse_loss_layer_025": 0.074219, "value_mse_loss_layer_026": 0.064941, "value_mse_loss_layer_027": 0.09082, "value_mse_loss_layer_028": 0.086426, "value_mse_loss_layer_029": 0.115234, "value_mse_loss_layer_030": 0.10791, "value_mse_loss_layer_031": 0.12793, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.8e-05, "vq_loss_layer_004": 9.3e-05, "vq_loss_layer_005": 0.000126, "vq_loss_layer_006": 0.000167, "vq_loss_layer_007": 0.00028, "vq_loss_layer_008": 0.000286, "vq_loss_layer_009": 0.000389, "vq_loss_layer_010": 0.000328, "vq_loss_layer_011": 0.000338, "vq_loss_layer_012": 0.000603, "vq_loss_layer_013": 0.0005, "vq_loss_layer_014": 0.000568, "vq_loss_layer_015": 0.000561, "vq_loss_layer_016": 0.00053, "vq_loss_layer_017": 0.000435, "vq_loss_layer_018": 0.000364, "vq_loss_layer_019": 0.000277, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.000534, "vq_loss_layer_022": 0.000309, "vq_loss_layer_023": 0.000431, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.000418, "vq_loss_layer_026": 0.000736, "vq_loss_layer_027": 0.000904, "vq_loss_layer_028": 0.001114, "vq_loss_layer_029": 0.001801, "vq_loss_layer_030": 0.002777, "vq_loss_layer_031": 0.00708 }, { "ce_loss": 2.323859, "epoch": 0.00221, "grad_norm": 0.0051875971257686615, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.04126, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.0625, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.057373, "kv_mse_loss": 0.05975, "kv_vq_loss": 0.000767, "learning_rate": 0.0008360980684212775, "loss": 0.060516, "step": 2210, "value_mse_loss_layer_000": 0.000889, "value_mse_loss_layer_001": 0.002457, "value_mse_loss_layer_002": 0.010315, "value_mse_loss_layer_003": 0.016235, "value_mse_loss_layer_004": 0.016113, "value_mse_loss_layer_005": 0.014832, "value_mse_loss_layer_006": 0.018555, "value_mse_loss_layer_007": 0.019409, "value_mse_loss_layer_008": 0.023193, "value_mse_loss_layer_009": 0.031006, "value_mse_loss_layer_010": 0.026001, "value_mse_loss_layer_011": 0.027466, "value_mse_loss_layer_012": 0.02832, "value_mse_loss_layer_013": 0.030762, "value_mse_loss_layer_014": 0.036133, "value_mse_loss_layer_015": 0.031494, "value_mse_loss_layer_016": 0.027222, "value_mse_loss_layer_017": 0.031006, "value_mse_loss_layer_018": 0.030151, "value_mse_loss_layer_019": 0.037354, "value_mse_loss_layer_020": 0.036133, "value_mse_loss_layer_021": 0.04248, "value_mse_loss_layer_022": 0.038574, "value_mse_loss_layer_023": 0.048584, "value_mse_loss_layer_024": 0.064453, "value_mse_loss_layer_025": 0.062256, "value_mse_loss_layer_026": 0.057373, "value_mse_loss_layer_027": 0.068848, "value_mse_loss_layer_028": 0.070801, "value_mse_loss_layer_029": 0.111328, "value_mse_loss_layer_030": 0.092773, "value_mse_loss_layer_031": 0.123047, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 2.3e-05, "vq_loss_layer_002": 2.5e-05, "vq_loss_layer_003": 6.2e-05, "vq_loss_layer_004": 0.000135, "vq_loss_layer_005": 0.000118, "vq_loss_layer_006": 0.000277, "vq_loss_layer_007": 0.000248, "vq_loss_layer_008": 0.000345, "vq_loss_layer_009": 0.000439, "vq_loss_layer_010": 0.00038, "vq_loss_layer_011": 0.000412, "vq_loss_layer_012": 0.000599, "vq_loss_layer_013": 0.000587, "vq_loss_layer_014": 0.000805, "vq_loss_layer_015": 0.000572, "vq_loss_layer_016": 0.000618, "vq_loss_layer_017": 0.000483, "vq_loss_layer_018": 0.000315, "vq_loss_layer_019": 0.000319, "vq_loss_layer_020": 0.000271, "vq_loss_layer_021": 0.000622, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.000507, "vq_loss_layer_024": 0.00061, "vq_loss_layer_025": 0.000515, "vq_loss_layer_026": 0.000927, "vq_loss_layer_027": 0.000683, "vq_loss_layer_028": 0.001404, "vq_loss_layer_029": 0.00293, "vq_loss_layer_030": 0.002991, "vq_loss_layer_031": 0.009766 }, { "ce_loss": 2.295707, "epoch": 0.00222, "grad_norm": 0.004636162426322699, "key_mse_loss_layer_000": 0.003479, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.059592, "kv_vq_loss": 0.000769, "learning_rate": 0.0008365882436126595, "loss": 0.060373, "step": 2220, "value_mse_loss_layer_000": 0.000896, "value_mse_loss_layer_001": 0.002533, "value_mse_loss_layer_002": 0.010071, "value_mse_loss_layer_003": 0.016968, "value_mse_loss_layer_004": 0.014526, "value_mse_loss_layer_005": 0.01416, "value_mse_loss_layer_006": 0.01709, "value_mse_loss_layer_007": 0.022827, "value_mse_loss_layer_008": 0.022339, "value_mse_loss_layer_009": 0.028564, "value_mse_loss_layer_010": 0.024414, "value_mse_loss_layer_011": 0.026611, "value_mse_loss_layer_012": 0.02832, "value_mse_loss_layer_013": 0.028809, "value_mse_loss_layer_014": 0.030518, "value_mse_loss_layer_015": 0.032715, "value_mse_loss_layer_016": 0.02832, "value_mse_loss_layer_017": 0.033936, "value_mse_loss_layer_018": 0.030396, "value_mse_loss_layer_019": 0.036133, "value_mse_loss_layer_020": 0.041016, "value_mse_loss_layer_021": 0.044922, "value_mse_loss_layer_022": 0.043701, "value_mse_loss_layer_023": 0.050049, "value_mse_loss_layer_024": 0.054688, "value_mse_loss_layer_025": 0.066895, "value_mse_loss_layer_026": 0.056152, "value_mse_loss_layer_027": 0.078125, "value_mse_loss_layer_028": 0.074219, "value_mse_loss_layer_029": 0.113281, "value_mse_loss_layer_030": 0.098633, "value_mse_loss_layer_031": 0.12207, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 8.7e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000182, "vq_loss_layer_007": 0.000404, "vq_loss_layer_008": 0.000269, "vq_loss_layer_009": 0.000326, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000357, "vq_loss_layer_012": 0.000664, "vq_loss_layer_013": 0.000507, "vq_loss_layer_014": 0.000538, "vq_loss_layer_015": 0.000618, "vq_loss_layer_016": 0.000595, "vq_loss_layer_017": 0.000587, "vq_loss_layer_018": 0.000277, "vq_loss_layer_019": 0.000246, "vq_loss_layer_020": 0.000349, "vq_loss_layer_021": 0.000561, "vq_loss_layer_022": 0.000347, "vq_loss_layer_023": 0.000412, "vq_loss_layer_024": 0.000366, "vq_loss_layer_025": 0.000463, "vq_loss_layer_026": 0.000698, "vq_loss_layer_027": 0.000877, "vq_loss_layer_028": 0.001015, "vq_loss_layer_029": 0.0019, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.007172 }, { "ce_loss": 2.293614, "epoch": 0.00223, "grad_norm": 0.0058067962527275085, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.077637, "kv_mse_loss": 0.05975, "kv_vq_loss": 0.000775, "learning_rate": 0.00083707621576204, "loss": 0.060526, "step": 2230, "value_mse_loss_layer_000": 0.000896, "value_mse_loss_layer_001": 0.002533, "value_mse_loss_layer_002": 0.009827, "value_mse_loss_layer_003": 0.017212, "value_mse_loss_layer_004": 0.014038, "value_mse_loss_layer_005": 0.014404, "value_mse_loss_layer_006": 0.016968, "value_mse_loss_layer_007": 0.018677, "value_mse_loss_layer_008": 0.022461, "value_mse_loss_layer_009": 0.030151, "value_mse_loss_layer_010": 0.024536, "value_mse_loss_layer_011": 0.027466, "value_mse_loss_layer_012": 0.0271, "value_mse_loss_layer_013": 0.028564, "value_mse_loss_layer_014": 0.030151, "value_mse_loss_layer_015": 0.032471, "value_mse_loss_layer_016": 0.028564, "value_mse_loss_layer_017": 0.032227, "value_mse_loss_layer_018": 0.031494, "value_mse_loss_layer_019": 0.036865, "value_mse_loss_layer_020": 0.038574, "value_mse_loss_layer_021": 0.048584, "value_mse_loss_layer_022": 0.045166, "value_mse_loss_layer_023": 0.055908, "value_mse_loss_layer_024": 0.057861, "value_mse_loss_layer_025": 0.071289, "value_mse_loss_layer_026": 0.054688, "value_mse_loss_layer_027": 0.072754, "value_mse_loss_layer_028": 0.078125, "value_mse_loss_layer_029": 0.117676, "value_mse_loss_layer_030": 0.106445, "value_mse_loss_layer_031": 0.123047, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 8.4e-05, "vq_loss_layer_005": 0.000109, "vq_loss_layer_006": 0.000189, "vq_loss_layer_007": 0.000265, "vq_loss_layer_008": 0.000261, "vq_loss_layer_009": 0.000443, "vq_loss_layer_010": 0.000275, "vq_loss_layer_011": 0.000412, "vq_loss_layer_012": 0.000572, "vq_loss_layer_013": 0.000456, "vq_loss_layer_014": 0.000523, "vq_loss_layer_015": 0.000584, "vq_loss_layer_016": 0.000553, "vq_loss_layer_017": 0.000546, "vq_loss_layer_018": 0.000296, "vq_loss_layer_019": 0.00025, "vq_loss_layer_020": 0.000292, "vq_loss_layer_021": 0.000534, "vq_loss_layer_022": 0.000313, "vq_loss_layer_023": 0.000423, "vq_loss_layer_024": 0.000353, "vq_loss_layer_025": 0.000406, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000668, "vq_loss_layer_028": 0.000973, "vq_loss_layer_029": 0.001808, "vq_loss_layer_030": 0.002655, "vq_loss_layer_031": 0.007324 }, { "ce_loss": 2.292952, "epoch": 0.00224, "grad_norm": 0.004925692453980446, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.128906, "key_mse_loss_layer_014": 0.124023, "key_mse_loss_layer_015": 0.111816, "key_mse_loss_layer_016": 0.104492, "key_mse_loss_layer_017": 0.106934, "key_mse_loss_layer_018": 0.112793, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.104004, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.101562, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.05979, "kv_vq_loss": 0.000805, "learning_rate": 0.0008375620045835406, "loss": 0.060602, "step": 2240, "value_mse_loss_layer_000": 0.000893, "value_mse_loss_layer_001": 0.002518, "value_mse_loss_layer_002": 0.010315, "value_mse_loss_layer_003": 0.015991, "value_mse_loss_layer_004": 0.015137, "value_mse_loss_layer_005": 0.01532, "value_mse_loss_layer_006": 0.018433, "value_mse_loss_layer_007": 0.020264, "value_mse_loss_layer_008": 0.022949, "value_mse_loss_layer_009": 0.031128, "value_mse_loss_layer_010": 0.026001, "value_mse_loss_layer_011": 0.027222, "value_mse_loss_layer_012": 0.027222, "value_mse_loss_layer_013": 0.029419, "value_mse_loss_layer_014": 0.037109, "value_mse_loss_layer_015": 0.03125, "value_mse_loss_layer_016": 0.028442, "value_mse_loss_layer_017": 0.031982, "value_mse_loss_layer_018": 0.029175, "value_mse_loss_layer_019": 0.041748, "value_mse_loss_layer_020": 0.038086, "value_mse_loss_layer_021": 0.04248, "value_mse_loss_layer_022": 0.041504, "value_mse_loss_layer_023": 0.047607, "value_mse_loss_layer_024": 0.050293, "value_mse_loss_layer_025": 0.065918, "value_mse_loss_layer_026": 0.066895, "value_mse_loss_layer_027": 0.066406, "value_mse_loss_layer_028": 0.069336, "value_mse_loss_layer_029": 0.097656, "value_mse_loss_layer_030": 0.09082, "value_mse_loss_layer_031": 0.121582, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 4.3e-05, "vq_loss_layer_004": 9.8e-05, "vq_loss_layer_005": 0.000127, "vq_loss_layer_006": 0.000254, "vq_loss_layer_007": 0.000294, "vq_loss_layer_008": 0.000324, "vq_loss_layer_009": 0.000469, "vq_loss_layer_010": 0.000404, "vq_loss_layer_011": 0.000374, "vq_loss_layer_012": 0.00058, "vq_loss_layer_013": 0.000504, "vq_loss_layer_014": 0.00116, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.000614, "vq_loss_layer_017": 0.000477, "vq_loss_layer_018": 0.000296, "vq_loss_layer_019": 0.000341, "vq_loss_layer_020": 0.000319, "vq_loss_layer_021": 0.000633, "vq_loss_layer_022": 0.000406, "vq_loss_layer_023": 0.000519, "vq_loss_layer_024": 0.000444, "vq_loss_layer_025": 0.00066, "vq_loss_layer_026": 0.001404, "vq_loss_layer_027": 0.000797, "vq_loss_layer_028": 0.001358, "vq_loss_layer_029": 0.002014, "vq_loss_layer_030": 0.003571, "vq_loss_layer_031": 0.00885 }, { "ce_loss": 2.34351, "epoch": 0.00225, "grad_norm": 0.005144132766872644, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.059387, "kv_vq_loss": 0.000788, "learning_rate": 0.0008380456295278406, "loss": 0.060178, "step": 2250, "value_mse_loss_layer_000": 0.000893, "value_mse_loss_layer_001": 0.002518, "value_mse_loss_layer_002": 0.009827, "value_mse_loss_layer_003": 0.015869, "value_mse_loss_layer_004": 0.014282, "value_mse_loss_layer_005": 0.014282, "value_mse_loss_layer_006": 0.016968, "value_mse_loss_layer_007": 0.018921, "value_mse_loss_layer_008": 0.022705, "value_mse_loss_layer_009": 0.028931, "value_mse_loss_layer_010": 0.025024, "value_mse_loss_layer_011": 0.02771, "value_mse_loss_layer_012": 0.030273, "value_mse_loss_layer_013": 0.028442, "value_mse_loss_layer_014": 0.032471, "value_mse_loss_layer_015": 0.036377, "value_mse_loss_layer_016": 0.032715, "value_mse_loss_layer_017": 0.032715, "value_mse_loss_layer_018": 0.033203, "value_mse_loss_layer_019": 0.036133, "value_mse_loss_layer_020": 0.03833, "value_mse_loss_layer_021": 0.053223, "value_mse_loss_layer_022": 0.04248, "value_mse_loss_layer_023": 0.050293, "value_mse_loss_layer_024": 0.052002, "value_mse_loss_layer_025": 0.066406, "value_mse_loss_layer_026": 0.058594, "value_mse_loss_layer_027": 0.098145, "value_mse_loss_layer_028": 0.072754, "value_mse_loss_layer_029": 0.104492, "value_mse_loss_layer_030": 0.101074, "value_mse_loss_layer_031": 0.116211, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 9.3e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000172, "vq_loss_layer_007": 0.000261, "vq_loss_layer_008": 0.000284, "vq_loss_layer_009": 0.000345, "vq_loss_layer_010": 0.000303, "vq_loss_layer_011": 0.000402, "vq_loss_layer_012": 0.000744, "vq_loss_layer_013": 0.000443, "vq_loss_layer_014": 0.000675, "vq_loss_layer_015": 0.000957, "vq_loss_layer_016": 0.000771, "vq_loss_layer_017": 0.000603, "vq_loss_layer_018": 0.000357, "vq_loss_layer_019": 0.000252, "vq_loss_layer_020": 0.000296, "vq_loss_layer_021": 0.000755, "vq_loss_layer_022": 0.000309, "vq_loss_layer_023": 0.000416, "vq_loss_layer_024": 0.000362, "vq_loss_layer_025": 0.000469, "vq_loss_layer_026": 0.000816, "vq_loss_layer_027": 0.00132, "vq_loss_layer_028": 0.001221, "vq_loss_layer_029": 0.002502, "vq_loss_layer_030": 0.002945, "vq_loss_layer_031": 0.007935 }, { "ce_loss": 2.32655, "epoch": 0.00226, "grad_norm": 0.005084776785224676, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.01123, "key_mse_loss_layer_002": 0.060791, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.074707, "key_mse_loss_layer_007": 0.081055, "key_mse_loss_layer_008": 0.09668, "key_mse_loss_layer_009": 0.102539, "key_mse_loss_layer_010": 0.115723, "key_mse_loss_layer_011": 0.111328, "key_mse_loss_layer_012": 0.085449, "key_mse_loss_layer_013": 0.147461, "key_mse_loss_layer_014": 0.141602, "key_mse_loss_layer_015": 0.130859, "key_mse_loss_layer_016": 0.12207, "key_mse_loss_layer_017": 0.118164, "key_mse_loss_layer_018": 0.125, "key_mse_loss_layer_019": 0.101562, "key_mse_loss_layer_020": 0.116699, "key_mse_loss_layer_021": 0.110352, "key_mse_loss_layer_022": 0.114746, "key_mse_loss_layer_023": 0.111328, "key_mse_loss_layer_024": 0.09082, "key_mse_loss_layer_025": 0.083008, "key_mse_loss_layer_026": 0.101562, "key_mse_loss_layer_027": 0.100586, "key_mse_loss_layer_028": 0.10498, "key_mse_loss_layer_029": 0.094238, "key_mse_loss_layer_030": 0.108887, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.05986, "kv_vq_loss": 0.000774, "learning_rate": 0.0008385271097868501, "loss": 0.060632, "step": 2260, "value_mse_loss_layer_000": 0.000854, "value_mse_loss_layer_001": 0.002563, "value_mse_loss_layer_002": 0.010498, "value_mse_loss_layer_003": 0.015869, "value_mse_loss_layer_004": 0.015564, "value_mse_loss_layer_005": 0.016724, "value_mse_loss_layer_006": 0.017456, "value_mse_loss_layer_007": 0.020508, "value_mse_loss_layer_008": 0.022705, "value_mse_loss_layer_009": 0.029053, "value_mse_loss_layer_010": 0.026611, "value_mse_loss_layer_011": 0.028687, "value_mse_loss_layer_012": 0.029419, "value_mse_loss_layer_013": 0.031494, "value_mse_loss_layer_014": 0.031494, "value_mse_loss_layer_015": 0.030518, "value_mse_loss_layer_016": 0.026611, "value_mse_loss_layer_017": 0.030151, "value_mse_loss_layer_018": 0.030518, "value_mse_loss_layer_019": 0.034668, "value_mse_loss_layer_020": 0.035156, "value_mse_loss_layer_021": 0.04248, "value_mse_loss_layer_022": 0.038818, "value_mse_loss_layer_023": 0.043945, "value_mse_loss_layer_024": 0.05249, "value_mse_loss_layer_025": 0.072266, "value_mse_loss_layer_026": 0.05249, "value_mse_loss_layer_027": 0.077637, "value_mse_loss_layer_028": 0.071289, "value_mse_loss_layer_029": 0.098633, "value_mse_loss_layer_030": 0.105469, "value_mse_loss_layer_031": 0.131836, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 2.7e-05, "vq_loss_layer_002": 4.1e-05, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 0.000129, "vq_loss_layer_005": 0.000252, "vq_loss_layer_006": 0.000235, "vq_loss_layer_007": 0.00038, "vq_loss_layer_008": 0.000404, "vq_loss_layer_009": 0.000463, "vq_loss_layer_010": 0.000504, "vq_loss_layer_011": 0.0005, "vq_loss_layer_012": 0.000702, "vq_loss_layer_013": 0.000607, "vq_loss_layer_014": 0.000771, "vq_loss_layer_015": 0.000748, "vq_loss_layer_016": 0.000668, "vq_loss_layer_017": 0.000519, "vq_loss_layer_018": 0.000328, "vq_loss_layer_019": 0.000301, "vq_loss_layer_020": 0.000349, "vq_loss_layer_021": 0.000782, "vq_loss_layer_022": 0.00045, "vq_loss_layer_023": 0.000534, "vq_loss_layer_024": 0.000641, "vq_loss_layer_025": 0.001503, "vq_loss_layer_026": 0.000973, "vq_loss_layer_027": 0.001602, "vq_loss_layer_028": 0.001793, "vq_loss_layer_029": 0.002655, "vq_loss_layer_030": 0.004822, "vq_loss_layer_031": 0.012085 }, { "ce_loss": 2.321029, "epoch": 0.00227, "grad_norm": 0.004833373241126537, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.059372, "kv_vq_loss": 0.000771, "learning_rate": 0.0008390064642982805, "loss": 0.060153, "step": 2270, "value_mse_loss_layer_000": 0.000889, "value_mse_loss_layer_001": 0.002563, "value_mse_loss_layer_002": 0.009949, "value_mse_loss_layer_003": 0.016235, "value_mse_loss_layer_004": 0.014343, "value_mse_loss_layer_005": 0.014465, "value_mse_loss_layer_006": 0.01709, "value_mse_loss_layer_007": 0.018799, "value_mse_loss_layer_008": 0.022827, "value_mse_loss_layer_009": 0.029175, "value_mse_loss_layer_010": 0.02478, "value_mse_loss_layer_011": 0.026367, "value_mse_loss_layer_012": 0.028076, "value_mse_loss_layer_013": 0.028442, "value_mse_loss_layer_014": 0.033447, "value_mse_loss_layer_015": 0.032959, "value_mse_loss_layer_016": 0.02832, "value_mse_loss_layer_017": 0.032471, "value_mse_loss_layer_018": 0.031128, "value_mse_loss_layer_019": 0.035645, "value_mse_loss_layer_020": 0.037842, "value_mse_loss_layer_021": 0.046875, "value_mse_loss_layer_022": 0.043213, "value_mse_loss_layer_023": 0.047852, "value_mse_loss_layer_024": 0.052979, "value_mse_loss_layer_025": 0.073242, "value_mse_loss_layer_026": 0.054932, "value_mse_loss_layer_027": 0.076172, "value_mse_loss_layer_028": 0.073242, "value_mse_loss_layer_029": 0.115723, "value_mse_loss_layer_030": 0.100586, "value_mse_loss_layer_031": 0.121094, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 9.3e-05, "vq_loss_layer_005": 0.000112, "vq_loss_layer_006": 0.000195, "vq_loss_layer_007": 0.000259, "vq_loss_layer_008": 0.000269, "vq_loss_layer_009": 0.000341, "vq_loss_layer_010": 0.000296, "vq_loss_layer_011": 0.000319, "vq_loss_layer_012": 0.000648, "vq_loss_layer_013": 0.00045, "vq_loss_layer_014": 0.000622, "vq_loss_layer_015": 0.000595, "vq_loss_layer_016": 0.000568, "vq_loss_layer_017": 0.000488, "vq_loss_layer_018": 0.000319, "vq_loss_layer_019": 0.000257, "vq_loss_layer_020": 0.000303, "vq_loss_layer_021": 0.000629, "vq_loss_layer_022": 0.000349, "vq_loss_layer_023": 0.00033, "vq_loss_layer_024": 0.000332, "vq_loss_layer_025": 0.00045, "vq_loss_layer_026": 0.000557, "vq_loss_layer_027": 0.000759, "vq_loss_layer_028": 0.000946, "vq_loss_layer_029": 0.001747, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.007233 }, { "ce_loss": 2.29066, "epoch": 0.00228, "grad_norm": 0.0051378775388002396, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.09082, "key_mse_loss_layer_009": 0.097656, "key_mse_loss_layer_010": 0.109375, "key_mse_loss_layer_011": 0.106445, "key_mse_loss_layer_012": 0.079102, "key_mse_loss_layer_013": 0.140625, "key_mse_loss_layer_014": 0.135742, "key_mse_loss_layer_015": 0.122559, "key_mse_loss_layer_016": 0.117676, "key_mse_loss_layer_017": 0.117188, "key_mse_loss_layer_018": 0.122559, "key_mse_loss_layer_019": 0.097656, "key_mse_loss_layer_020": 0.112305, "key_mse_loss_layer_021": 0.10498, "key_mse_loss_layer_022": 0.109375, "key_mse_loss_layer_023": 0.108398, "key_mse_loss_layer_024": 0.085449, "key_mse_loss_layer_025": 0.080566, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.090332, "key_mse_loss_layer_028": 0.099609, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.09375, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.059586, "kv_vq_loss": 0.000772, "learning_rate": 0.0008394837117501133, "loss": 0.060376, "step": 2280, "value_mse_loss_layer_000": 0.000877, "value_mse_loss_layer_001": 0.002472, "value_mse_loss_layer_002": 0.009949, "value_mse_loss_layer_003": 0.016235, "value_mse_loss_layer_004": 0.015076, "value_mse_loss_layer_005": 0.015076, "value_mse_loss_layer_006": 0.01709, "value_mse_loss_layer_007": 0.020386, "value_mse_loss_layer_008": 0.022705, "value_mse_loss_layer_009": 0.029541, "value_mse_loss_layer_010": 0.025391, "value_mse_loss_layer_011": 0.027588, "value_mse_loss_layer_012": 0.026978, "value_mse_loss_layer_013": 0.028442, "value_mse_loss_layer_014": 0.029419, "value_mse_loss_layer_015": 0.029053, "value_mse_loss_layer_016": 0.025391, "value_mse_loss_layer_017": 0.030884, "value_mse_loss_layer_018": 0.028564, "value_mse_loss_layer_019": 0.032959, "value_mse_loss_layer_020": 0.036133, "value_mse_loss_layer_021": 0.041992, "value_mse_loss_layer_022": 0.038086, "value_mse_loss_layer_023": 0.054932, "value_mse_loss_layer_024": 0.067871, "value_mse_loss_layer_025": 0.061523, "value_mse_loss_layer_026": 0.05249, "value_mse_loss_layer_027": 0.067383, "value_mse_loss_layer_028": 0.069336, "value_mse_loss_layer_029": 0.099121, "value_mse_loss_layer_030": 0.089844, "value_mse_loss_layer_031": 0.119629, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 3e-05, "vq_loss_layer_002": 2.5e-05, "vq_loss_layer_003": 5.1e-05, "vq_loss_layer_004": 0.0001, "vq_loss_layer_005": 0.000129, "vq_loss_layer_006": 0.000179, "vq_loss_layer_007": 0.000303, "vq_loss_layer_008": 0.000338, "vq_loss_layer_009": 0.000425, "vq_loss_layer_010": 0.000393, "vq_loss_layer_011": 0.000496, "vq_loss_layer_012": 0.00061, "vq_loss_layer_013": 0.000483, "vq_loss_layer_014": 0.000645, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.00053, "vq_loss_layer_017": 0.000515, "vq_loss_layer_018": 0.000317, "vq_loss_layer_019": 0.000267, "vq_loss_layer_020": 0.000334, "vq_loss_layer_021": 0.000645, "vq_loss_layer_022": 0.000353, "vq_loss_layer_023": 0.000809, "vq_loss_layer_024": 0.000881, "vq_loss_layer_025": 0.000679, "vq_loss_layer_026": 0.000839, "vq_loss_layer_027": 0.000809, "vq_loss_layer_028": 0.001526, "vq_loss_layer_029": 0.00235, "vq_loss_layer_030": 0.003769, "vq_loss_layer_031": 0.009216 }, { "ce_loss": 2.311841, "epoch": 0.00229, "grad_norm": 0.005315977148711681, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.059528, "kv_vq_loss": 0.000784, "learning_rate": 0.0008399588705849719, "loss": 0.060312, "step": 2290, "value_mse_loss_layer_000": 0.000881, "value_mse_loss_layer_001": 0.002457, "value_mse_loss_layer_002": 0.009705, "value_mse_loss_layer_003": 0.016602, "value_mse_loss_layer_004": 0.013977, "value_mse_loss_layer_005": 0.015503, "value_mse_loss_layer_006": 0.016968, "value_mse_loss_layer_007": 0.019531, "value_mse_loss_layer_008": 0.022461, "value_mse_loss_layer_009": 0.029663, "value_mse_loss_layer_010": 0.026123, "value_mse_loss_layer_011": 0.027344, "value_mse_loss_layer_012": 0.029053, "value_mse_loss_layer_013": 0.031738, "value_mse_loss_layer_014": 0.030884, "value_mse_loss_layer_015": 0.033203, "value_mse_loss_layer_016": 0.031006, "value_mse_loss_layer_017": 0.031982, "value_mse_loss_layer_018": 0.029907, "value_mse_loss_layer_019": 0.036621, "value_mse_loss_layer_020": 0.04541, "value_mse_loss_layer_021": 0.050293, "value_mse_loss_layer_022": 0.041748, "value_mse_loss_layer_023": 0.052246, "value_mse_loss_layer_024": 0.054932, "value_mse_loss_layer_025": 0.063477, "value_mse_loss_layer_026": 0.054443, "value_mse_loss_layer_027": 0.070312, "value_mse_loss_layer_028": 0.070801, "value_mse_loss_layer_029": 0.110352, "value_mse_loss_layer_030": 0.097656, "value_mse_loss_layer_031": 0.11377, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 0.000152, "vq_loss_layer_006": 0.000181, "vq_loss_layer_007": 0.000303, "vq_loss_layer_008": 0.000254, "vq_loss_layer_009": 0.000368, "vq_loss_layer_010": 0.000313, "vq_loss_layer_011": 0.000334, "vq_loss_layer_012": 0.000702, "vq_loss_layer_013": 0.000599, "vq_loss_layer_014": 0.000542, "vq_loss_layer_015": 0.00058, "vq_loss_layer_016": 0.000629, "vq_loss_layer_017": 0.000481, "vq_loss_layer_018": 0.000286, "vq_loss_layer_019": 0.000273, "vq_loss_layer_020": 0.000357, "vq_loss_layer_021": 0.000618, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.000463, "vq_loss_layer_024": 0.000441, "vq_loss_layer_025": 0.000395, "vq_loss_layer_026": 0.000679, "vq_loss_layer_027": 0.000729, "vq_loss_layer_028": 0.000973, "vq_loss_layer_029": 0.002045, "vq_loss_layer_030": 0.002747, "vq_loss_layer_031": 0.006897 }, { "ce_loss": 2.244874, "epoch": 0.0023, "grad_norm": 0.0056855808943510056, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.044922, "key_mse_loss_layer_004": 0.040283, "key_mse_loss_layer_005": 0.05542, "key_mse_loss_layer_006": 0.060791, "key_mse_loss_layer_007": 0.070801, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.102051, "key_mse_loss_layer_024": 0.083008, "key_mse_loss_layer_025": 0.078613, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.090332, "key_mse_loss_layer_028": 0.09668, "key_mse_loss_layer_029": 0.091309, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.059436, "kv_vq_loss": 0.00079, "learning_rate": 0.0008404319590043981, "loss": 0.060226, "step": 2300, "value_mse_loss_layer_000": 0.000854, "value_mse_loss_layer_001": 0.002396, "value_mse_loss_layer_002": 0.012085, "value_mse_loss_layer_003": 0.016968, "value_mse_loss_layer_004": 0.015198, "value_mse_loss_layer_005": 0.014771, "value_mse_loss_layer_006": 0.016357, "value_mse_loss_layer_007": 0.019043, "value_mse_loss_layer_008": 0.023926, "value_mse_loss_layer_009": 0.029053, "value_mse_loss_layer_010": 0.025146, "value_mse_loss_layer_011": 0.026367, "value_mse_loss_layer_012": 0.027954, "value_mse_loss_layer_013": 0.030396, "value_mse_loss_layer_014": 0.031982, "value_mse_loss_layer_015": 0.032471, "value_mse_loss_layer_016": 0.028809, "value_mse_loss_layer_017": 0.033447, "value_mse_loss_layer_018": 0.031982, "value_mse_loss_layer_019": 0.035645, "value_mse_loss_layer_020": 0.039795, "value_mse_loss_layer_021": 0.046143, "value_mse_loss_layer_022": 0.045166, "value_mse_loss_layer_023": 0.05542, "value_mse_loss_layer_024": 0.0625, "value_mse_loss_layer_025": 0.071289, "value_mse_loss_layer_026": 0.066895, "value_mse_loss_layer_027": 0.088379, "value_mse_loss_layer_028": 0.089355, "value_mse_loss_layer_029": 0.141602, "value_mse_loss_layer_030": 0.114746, "value_mse_loss_layer_031": 0.140625, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 2.4e-05, "vq_loss_layer_002": 3.1e-05, "vq_loss_layer_003": 6.2e-05, "vq_loss_layer_004": 0.000123, "vq_loss_layer_005": 0.000115, "vq_loss_layer_006": 0.000173, "vq_loss_layer_007": 0.000242, "vq_loss_layer_008": 0.000385, "vq_loss_layer_009": 0.00034, "vq_loss_layer_010": 0.000357, "vq_loss_layer_011": 0.000319, "vq_loss_layer_012": 0.000572, "vq_loss_layer_013": 0.000481, "vq_loss_layer_014": 0.000587, "vq_loss_layer_015": 0.000526, "vq_loss_layer_016": 0.000511, "vq_loss_layer_017": 0.000448, "vq_loss_layer_018": 0.000271, "vq_loss_layer_019": 0.00022, "vq_loss_layer_020": 0.000232, "vq_loss_layer_021": 0.000467, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000422, "vq_loss_layer_024": 0.000368, "vq_loss_layer_025": 0.000433, "vq_loss_layer_026": 0.000641, "vq_loss_layer_027": 0.000801, "vq_loss_layer_028": 0.001816, "vq_loss_layer_029": 0.004456, "vq_loss_layer_030": 0.003326, "vq_loss_layer_031": 0.010254 }, { "ce_loss": 2.27167, "epoch": 0.00231, "grad_norm": 0.005428717937320471, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.059381, "kv_vq_loss": 0.000775, "learning_rate": 0.0008409029949730359, "loss": 0.060168, "step": 2310, "value_mse_loss_layer_000": 0.000889, "value_mse_loss_layer_001": 0.002579, "value_mse_loss_layer_002": 0.010193, "value_mse_loss_layer_003": 0.015869, "value_mse_loss_layer_004": 0.014832, "value_mse_loss_layer_005": 0.014465, "value_mse_loss_layer_006": 0.017334, "value_mse_loss_layer_007": 0.019165, "value_mse_loss_layer_008": 0.022705, "value_mse_loss_layer_009": 0.028931, "value_mse_loss_layer_010": 0.028687, "value_mse_loss_layer_011": 0.027222, "value_mse_loss_layer_012": 0.027466, "value_mse_loss_layer_013": 0.028442, "value_mse_loss_layer_014": 0.030518, "value_mse_loss_layer_015": 0.032471, "value_mse_loss_layer_016": 0.028076, "value_mse_loss_layer_017": 0.031982, "value_mse_loss_layer_018": 0.037354, "value_mse_loss_layer_019": 0.035156, "value_mse_loss_layer_020": 0.036377, "value_mse_loss_layer_021": 0.043457, "value_mse_loss_layer_022": 0.04126, "value_mse_loss_layer_023": 0.047363, "value_mse_loss_layer_024": 0.053955, "value_mse_loss_layer_025": 0.074219, "value_mse_loss_layer_026": 0.052979, "value_mse_loss_layer_027": 0.070801, "value_mse_loss_layer_028": 0.084961, "value_mse_loss_layer_029": 0.106445, "value_mse_loss_layer_030": 0.09375, "value_mse_loss_layer_031": 0.123047, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 0.000124, "vq_loss_layer_005": 0.000111, "vq_loss_layer_006": 0.000201, "vq_loss_layer_007": 0.000275, "vq_loss_layer_008": 0.000294, "vq_loss_layer_009": 0.000366, "vq_loss_layer_010": 0.000376, "vq_loss_layer_011": 0.000401, "vq_loss_layer_012": 0.000576, "vq_loss_layer_013": 0.00046, "vq_loss_layer_014": 0.000587, "vq_loss_layer_015": 0.000591, "vq_loss_layer_016": 0.000622, "vq_loss_layer_017": 0.0005, "vq_loss_layer_018": 0.000429, "vq_loss_layer_019": 0.000315, "vq_loss_layer_020": 0.000286, "vq_loss_layer_021": 0.00058, "vq_loss_layer_022": 0.000359, "vq_loss_layer_023": 0.000412, "vq_loss_layer_024": 0.000618, "vq_loss_layer_025": 0.000553, "vq_loss_layer_026": 0.00069, "vq_loss_layer_027": 0.000847, "vq_loss_layer_028": 0.001534, "vq_loss_layer_029": 0.001793, "vq_loss_layer_030": 0.002701, "vq_loss_layer_031": 0.007996 }, { "ce_loss": 2.332056, "epoch": 0.00232, "grad_norm": 0.005224734544754028, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.059546, "kv_vq_loss": 0.000781, "learning_rate": 0.0008413719962227249, "loss": 0.060318, "step": 2320, "value_mse_loss_layer_000": 0.000881, "value_mse_loss_layer_001": 0.002533, "value_mse_loss_layer_002": 0.010498, "value_mse_loss_layer_003": 0.016968, "value_mse_loss_layer_004": 0.015076, "value_mse_loss_layer_005": 0.014526, "value_mse_loss_layer_006": 0.016846, "value_mse_loss_layer_007": 0.018921, "value_mse_loss_layer_008": 0.02356, "value_mse_loss_layer_009": 0.028687, "value_mse_loss_layer_010": 0.025146, "value_mse_loss_layer_011": 0.025879, "value_mse_loss_layer_012": 0.027466, "value_mse_loss_layer_013": 0.027954, "value_mse_loss_layer_014": 0.029541, "value_mse_loss_layer_015": 0.031494, "value_mse_loss_layer_016": 0.033447, "value_mse_loss_layer_017": 0.032471, "value_mse_loss_layer_018": 0.031738, "value_mse_loss_layer_019": 0.037354, "value_mse_loss_layer_020": 0.036621, "value_mse_loss_layer_021": 0.044434, "value_mse_loss_layer_022": 0.042236, "value_mse_loss_layer_023": 0.050781, "value_mse_loss_layer_024": 0.075195, "value_mse_loss_layer_025": 0.068359, "value_mse_loss_layer_026": 0.061523, "value_mse_loss_layer_027": 0.07373, "value_mse_loss_layer_028": 0.077637, "value_mse_loss_layer_029": 0.115234, "value_mse_loss_layer_030": 0.099121, "value_mse_loss_layer_031": 0.130859, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 0.000106, "vq_loss_layer_005": 0.000114, "vq_loss_layer_006": 0.000187, "vq_loss_layer_007": 0.000271, "vq_loss_layer_008": 0.000372, "vq_loss_layer_009": 0.000349, "vq_loss_layer_010": 0.000366, "vq_loss_layer_011": 0.000328, "vq_loss_layer_012": 0.000629, "vq_loss_layer_013": 0.000473, "vq_loss_layer_014": 0.000546, "vq_loss_layer_015": 0.000648, "vq_loss_layer_016": 0.000801, "vq_loss_layer_017": 0.000584, "vq_loss_layer_018": 0.000408, "vq_loss_layer_019": 0.000349, "vq_loss_layer_020": 0.000284, "vq_loss_layer_021": 0.000572, "vq_loss_layer_022": 0.000364, "vq_loss_layer_023": 0.000429, "vq_loss_layer_024": 0.00069, "vq_loss_layer_025": 0.000534, "vq_loss_layer_026": 0.000938, "vq_loss_layer_027": 0.000763, "vq_loss_layer_028": 0.001266, "vq_loss_layer_029": 0.00235, "vq_loss_layer_030": 0.00293, "vq_loss_layer_031": 0.009705 }, { "ce_loss": 2.328851, "epoch": 0.00233, "grad_norm": 0.005303952842950821, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.087891, "key_mse_loss_layer_028": 0.094727, "key_mse_loss_layer_029": 0.09375, "key_mse_loss_layer_030": 0.09668, "key_mse_loss_layer_031": 0.084473, "kv_mse_loss": 0.059604, "kv_vq_loss": 0.000796, "learning_rate": 0.0008418389802565046, "loss": 0.060406, "step": 2330, "value_mse_loss_layer_000": 0.000866, "value_mse_loss_layer_001": 0.002457, "value_mse_loss_layer_002": 0.009949, "value_mse_loss_layer_003": 0.016602, "value_mse_loss_layer_004": 0.014343, "value_mse_loss_layer_005": 0.014526, "value_mse_loss_layer_006": 0.01709, "value_mse_loss_layer_007": 0.018555, "value_mse_loss_layer_008": 0.022339, "value_mse_loss_layer_009": 0.029541, "value_mse_loss_layer_010": 0.024658, "value_mse_loss_layer_011": 0.0271, "value_mse_loss_layer_012": 0.028687, "value_mse_loss_layer_013": 0.027832, "value_mse_loss_layer_014": 0.032715, "value_mse_loss_layer_015": 0.030884, "value_mse_loss_layer_016": 0.029541, "value_mse_loss_layer_017": 0.031738, "value_mse_loss_layer_018": 0.031006, "value_mse_loss_layer_019": 0.039062, "value_mse_loss_layer_020": 0.047119, "value_mse_loss_layer_021": 0.042725, "value_mse_loss_layer_022": 0.045654, "value_mse_loss_layer_023": 0.051025, "value_mse_loss_layer_024": 0.065918, "value_mse_loss_layer_025": 0.066895, "value_mse_loss_layer_026": 0.05957, "value_mse_loss_layer_027": 0.086426, "value_mse_loss_layer_028": 0.079102, "value_mse_loss_layer_029": 0.116699, "value_mse_loss_layer_030": 0.117676, "value_mse_loss_layer_031": 0.125977, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000186, "vq_loss_layer_007": 0.000243, "vq_loss_layer_008": 0.000271, "vq_loss_layer_009": 0.000429, "vq_loss_layer_010": 0.000292, "vq_loss_layer_011": 0.00042, "vq_loss_layer_012": 0.000702, "vq_loss_layer_013": 0.000439, "vq_loss_layer_014": 0.000557, "vq_loss_layer_015": 0.000561, "vq_loss_layer_016": 0.000633, "vq_loss_layer_017": 0.000465, "vq_loss_layer_018": 0.000319, "vq_loss_layer_019": 0.000271, "vq_loss_layer_020": 0.000414, "vq_loss_layer_021": 0.00046, "vq_loss_layer_022": 0.000383, "vq_loss_layer_023": 0.000412, "vq_loss_layer_024": 0.000576, "vq_loss_layer_025": 0.00053, "vq_loss_layer_026": 0.000908, "vq_loss_layer_027": 0.001389, "vq_loss_layer_028": 0.001671, "vq_loss_layer_029": 0.003784, "vq_loss_layer_030": 0.004486, "vq_loss_layer_031": 0.00946 }, { "ce_loss": 2.311535, "epoch": 0.00234, "grad_norm": 0.004962447565048933, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.062012, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.059479, "kv_vq_loss": 0.000773, "learning_rate": 0.0008423039643525355, "loss": 0.060263, "step": 2340, "value_mse_loss_layer_000": 0.000889, "value_mse_loss_layer_001": 0.002457, "value_mse_loss_layer_002": 0.009644, "value_mse_loss_layer_003": 0.014771, "value_mse_loss_layer_004": 0.014221, "value_mse_loss_layer_005": 0.013977, "value_mse_loss_layer_006": 0.016724, "value_mse_loss_layer_007": 0.019897, "value_mse_loss_layer_008": 0.022095, "value_mse_loss_layer_009": 0.029053, "value_mse_loss_layer_010": 0.024658, "value_mse_loss_layer_011": 0.026978, "value_mse_loss_layer_012": 0.026978, "value_mse_loss_layer_013": 0.029175, "value_mse_loss_layer_014": 0.029663, "value_mse_loss_layer_015": 0.033447, "value_mse_loss_layer_016": 0.02771, "value_mse_loss_layer_017": 0.031494, "value_mse_loss_layer_018": 0.030151, "value_mse_loss_layer_019": 0.037109, "value_mse_loss_layer_020": 0.037354, "value_mse_loss_layer_021": 0.044189, "value_mse_loss_layer_022": 0.043457, "value_mse_loss_layer_023": 0.046875, "value_mse_loss_layer_024": 0.052979, "value_mse_loss_layer_025": 0.069824, "value_mse_loss_layer_026": 0.063477, "value_mse_loss_layer_027": 0.071289, "value_mse_loss_layer_028": 0.071289, "value_mse_loss_layer_029": 0.105469, "value_mse_loss_layer_030": 0.09375, "value_mse_loss_layer_031": 0.11377, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 0.000119, "vq_loss_layer_005": 0.000107, "vq_loss_layer_006": 0.000191, "vq_loss_layer_007": 0.000347, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.000359, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000351, "vq_loss_layer_012": 0.000568, "vq_loss_layer_013": 0.000492, "vq_loss_layer_014": 0.000542, "vq_loss_layer_015": 0.000648, "vq_loss_layer_016": 0.000538, "vq_loss_layer_017": 0.000463, "vq_loss_layer_018": 0.000277, "vq_loss_layer_019": 0.00025, "vq_loss_layer_020": 0.000292, "vq_loss_layer_021": 0.000538, "vq_loss_layer_022": 0.000343, "vq_loss_layer_023": 0.000364, "vq_loss_layer_024": 0.00034, "vq_loss_layer_025": 0.000452, "vq_loss_layer_026": 0.000961, "vq_loss_layer_027": 0.000683, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.001663, "vq_loss_layer_030": 0.002731, "vq_loss_layer_031": 0.006165 }, { "ce_loss": 2.264725, "epoch": 0.00235, "grad_norm": 0.005320502910763025, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.060147, "kv_vq_loss": 0.000818, "learning_rate": 0.000842766965567934, "loss": 0.060959, "step": 2350, "value_mse_loss_layer_000": 0.000858, "value_mse_loss_layer_001": 0.002457, "value_mse_loss_layer_002": 0.009888, "value_mse_loss_layer_003": 0.020874, "value_mse_loss_layer_004": 0.014038, "value_mse_loss_layer_005": 0.014343, "value_mse_loss_layer_006": 0.016724, "value_mse_loss_layer_007": 0.018799, "value_mse_loss_layer_008": 0.022705, "value_mse_loss_layer_009": 0.028564, "value_mse_loss_layer_010": 0.028931, "value_mse_loss_layer_011": 0.026001, "value_mse_loss_layer_012": 0.027344, "value_mse_loss_layer_013": 0.027832, "value_mse_loss_layer_014": 0.031006, "value_mse_loss_layer_015": 0.032471, "value_mse_loss_layer_016": 0.03064, "value_mse_loss_layer_017": 0.033691, "value_mse_loss_layer_018": 0.029785, "value_mse_loss_layer_019": 0.034912, "value_mse_loss_layer_020": 0.036621, "value_mse_loss_layer_021": 0.062256, "value_mse_loss_layer_022": 0.041992, "value_mse_loss_layer_023": 0.05127, "value_mse_loss_layer_024": 0.05249, "value_mse_loss_layer_025": 0.072754, "value_mse_loss_layer_026": 0.054688, "value_mse_loss_layer_027": 0.070312, "value_mse_loss_layer_028": 0.077148, "value_mse_loss_layer_029": 0.105957, "value_mse_loss_layer_030": 0.093262, "value_mse_loss_layer_031": 0.118164, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 5.1e-05, "vq_loss_layer_004": 8.8e-05, "vq_loss_layer_005": 0.000106, "vq_loss_layer_006": 0.000185, "vq_loss_layer_007": 0.000269, "vq_loss_layer_008": 0.000284, "vq_loss_layer_009": 0.000357, "vq_loss_layer_010": 0.000345, "vq_loss_layer_011": 0.000328, "vq_loss_layer_012": 0.000572, "vq_loss_layer_013": 0.000437, "vq_loss_layer_014": 0.000572, "vq_loss_layer_015": 0.000652, "vq_loss_layer_016": 0.000656, "vq_loss_layer_017": 0.000805, "vq_loss_layer_018": 0.000286, "vq_loss_layer_019": 0.000241, "vq_loss_layer_020": 0.000263, "vq_loss_layer_021": 0.000881, "vq_loss_layer_022": 0.000326, "vq_loss_layer_023": 0.000446, "vq_loss_layer_024": 0.000324, "vq_loss_layer_025": 0.000486, "vq_loss_layer_026": 0.000656, "vq_loss_layer_027": 0.000671, "vq_loss_layer_028": 0.001175, "vq_loss_layer_029": 0.001816, "vq_loss_layer_030": 0.002869, "vq_loss_layer_031": 0.007294 }, { "ce_loss": 2.312387, "epoch": 0.00236, "grad_norm": 0.004773188382387161, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.059637, "kv_vq_loss": 0.000788, "learning_rate": 0.0008432280007425265, "loss": 0.060428, "step": 2360, "value_mse_loss_layer_000": 0.0009, "value_mse_loss_layer_001": 0.002762, "value_mse_loss_layer_002": 0.010132, "value_mse_loss_layer_003": 0.015625, "value_mse_loss_layer_004": 0.013916, "value_mse_loss_layer_005": 0.014526, "value_mse_loss_layer_006": 0.016846, "value_mse_loss_layer_007": 0.018311, "value_mse_loss_layer_008": 0.021973, "value_mse_loss_layer_009": 0.028564, "value_mse_loss_layer_010": 0.025269, "value_mse_loss_layer_011": 0.026245, "value_mse_loss_layer_012": 0.031982, "value_mse_loss_layer_013": 0.027588, "value_mse_loss_layer_014": 0.030151, "value_mse_loss_layer_015": 0.031006, "value_mse_loss_layer_016": 0.026978, "value_mse_loss_layer_017": 0.031128, "value_mse_loss_layer_018": 0.031982, "value_mse_loss_layer_019": 0.034668, "value_mse_loss_layer_020": 0.036621, "value_mse_loss_layer_021": 0.044189, "value_mse_loss_layer_022": 0.043213, "value_mse_loss_layer_023": 0.049561, "value_mse_loss_layer_024": 0.049316, "value_mse_loss_layer_025": 0.066406, "value_mse_loss_layer_026": 0.051514, "value_mse_loss_layer_027": 0.072754, "value_mse_loss_layer_028": 0.068848, "value_mse_loss_layer_029": 0.10498, "value_mse_loss_layer_030": 0.100586, "value_mse_loss_layer_031": 0.117676, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 0.000112, "vq_loss_layer_006": 0.000184, "vq_loss_layer_007": 0.000254, "vq_loss_layer_008": 0.000263, "vq_loss_layer_009": 0.000355, "vq_loss_layer_010": 0.000301, "vq_loss_layer_011": 0.000368, "vq_loss_layer_012": 0.000984, "vq_loss_layer_013": 0.000477, "vq_loss_layer_014": 0.000603, "vq_loss_layer_015": 0.000557, "vq_loss_layer_016": 0.000546, "vq_loss_layer_017": 0.000557, "vq_loss_layer_018": 0.000319, "vq_loss_layer_019": 0.000227, "vq_loss_layer_020": 0.000278, "vq_loss_layer_021": 0.000561, "vq_loss_layer_022": 0.000355, "vq_loss_layer_023": 0.000435, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.000456, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.000793, "vq_loss_layer_028": 0.000927, "vq_loss_layer_029": 0.001717, "vq_loss_layer_030": 0.003006, "vq_loss_layer_031": 0.007111 }, { "ce_loss": 2.306731, "epoch": 0.00237, "grad_norm": 0.004054947756230831, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.059622, "kv_vq_loss": 0.00077, "learning_rate": 0.0008436870865025258, "loss": 0.060406, "step": 2370, "value_mse_loss_layer_000": 0.000874, "value_mse_loss_layer_001": 0.002441, "value_mse_loss_layer_002": 0.009949, "value_mse_loss_layer_003": 0.015869, "value_mse_loss_layer_004": 0.014648, "value_mse_loss_layer_005": 0.014893, "value_mse_loss_layer_006": 0.01709, "value_mse_loss_layer_007": 0.020996, "value_mse_loss_layer_008": 0.022705, "value_mse_loss_layer_009": 0.02832, "value_mse_loss_layer_010": 0.02478, "value_mse_loss_layer_011": 0.026855, "value_mse_loss_layer_012": 0.028687, "value_mse_loss_layer_013": 0.031982, "value_mse_loss_layer_014": 0.030029, "value_mse_loss_layer_015": 0.032959, "value_mse_loss_layer_016": 0.0271, "value_mse_loss_layer_017": 0.030762, "value_mse_loss_layer_018": 0.029663, "value_mse_loss_layer_019": 0.035645, "value_mse_loss_layer_020": 0.037109, "value_mse_loss_layer_021": 0.043701, "value_mse_loss_layer_022": 0.040771, "value_mse_loss_layer_023": 0.044434, "value_mse_loss_layer_024": 0.057373, "value_mse_loss_layer_025": 0.063965, "value_mse_loss_layer_026": 0.05127, "value_mse_loss_layer_027": 0.067383, "value_mse_loss_layer_028": 0.076172, "value_mse_loss_layer_029": 0.098145, "value_mse_loss_layer_030": 0.09375, "value_mse_loss_layer_031": 0.114258, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 0.000109, "vq_loss_layer_005": 0.000147, "vq_loss_layer_006": 0.000218, "vq_loss_layer_007": 0.000389, "vq_loss_layer_008": 0.000341, "vq_loss_layer_009": 0.000366, "vq_loss_layer_010": 0.00033, "vq_loss_layer_011": 0.000387, "vq_loss_layer_012": 0.000656, "vq_loss_layer_013": 0.000698, "vq_loss_layer_014": 0.000629, "vq_loss_layer_015": 0.000778, "vq_loss_layer_016": 0.000576, "vq_loss_layer_017": 0.00053, "vq_loss_layer_018": 0.00032, "vq_loss_layer_019": 0.000326, "vq_loss_layer_020": 0.00037, "vq_loss_layer_021": 0.000664, "vq_loss_layer_022": 0.000422, "vq_loss_layer_023": 0.000477, "vq_loss_layer_024": 0.000618, "vq_loss_layer_025": 0.000622, "vq_loss_layer_026": 0.000824, "vq_loss_layer_027": 0.000828, "vq_loss_layer_028": 0.001396, "vq_loss_layer_029": 0.001602, "vq_loss_layer_030": 0.002731, "vq_loss_layer_031": 0.007507 }, { "ce_loss": 2.294463, "epoch": 0.00238, "grad_norm": 0.004377620294690132, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.059125, "kv_vq_loss": 0.000751, "learning_rate": 0.0008441442392641278, "loss": 0.059894, "step": 2380, "value_mse_loss_layer_000": 0.000893, "value_mse_loss_layer_001": 0.002502, "value_mse_loss_layer_002": 0.009827, "value_mse_loss_layer_003": 0.015442, "value_mse_loss_layer_004": 0.014404, "value_mse_loss_layer_005": 0.014648, "value_mse_loss_layer_006": 0.017212, "value_mse_loss_layer_007": 0.018555, "value_mse_loss_layer_008": 0.021851, "value_mse_loss_layer_009": 0.028687, "value_mse_loss_layer_010": 0.02478, "value_mse_loss_layer_011": 0.026733, "value_mse_loss_layer_012": 0.026001, "value_mse_loss_layer_013": 0.027954, "value_mse_loss_layer_014": 0.028687, "value_mse_loss_layer_015": 0.030029, "value_mse_loss_layer_016": 0.029419, "value_mse_loss_layer_017": 0.030273, "value_mse_loss_layer_018": 0.028931, "value_mse_loss_layer_019": 0.032715, "value_mse_loss_layer_020": 0.036865, "value_mse_loss_layer_021": 0.04126, "value_mse_loss_layer_022": 0.040039, "value_mse_loss_layer_023": 0.045166, "value_mse_loss_layer_024": 0.050537, "value_mse_loss_layer_025": 0.061523, "value_mse_loss_layer_026": 0.051025, "value_mse_loss_layer_027": 0.066895, "value_mse_loss_layer_028": 0.067383, "value_mse_loss_layer_029": 0.101562, "value_mse_loss_layer_030": 0.09082, "value_mse_loss_layer_031": 0.120605, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 0.000115, "vq_loss_layer_006": 0.000212, "vq_loss_layer_007": 0.00025, "vq_loss_layer_008": 0.000282, "vq_loss_layer_009": 0.000401, "vq_loss_layer_010": 0.000353, "vq_loss_layer_011": 0.000395, "vq_loss_layer_012": 0.000538, "vq_loss_layer_013": 0.000431, "vq_loss_layer_014": 0.000591, "vq_loss_layer_015": 0.000576, "vq_loss_layer_016": 0.000648, "vq_loss_layer_017": 0.000444, "vq_loss_layer_018": 0.00028, "vq_loss_layer_019": 0.000216, "vq_loss_layer_020": 0.00034, "vq_loss_layer_021": 0.000549, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.000376, "vq_loss_layer_024": 0.000372, "vq_loss_layer_025": 0.000504, "vq_loss_layer_026": 0.000748, "vq_loss_layer_027": 0.000809, "vq_loss_layer_028": 0.001076, "vq_loss_layer_029": 0.001984, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.007996 }, { "ce_loss": 2.299841, "epoch": 0.00239, "grad_norm": 0.005769800860434771, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.059689, "kv_vq_loss": 0.000771, "learning_rate": 0.0008445994752370343, "loss": 0.060455, "step": 2390, "value_mse_loss_layer_000": 0.000881, "value_mse_loss_layer_001": 0.002441, "value_mse_loss_layer_002": 0.010193, "value_mse_loss_layer_003": 0.016846, "value_mse_loss_layer_004": 0.013855, "value_mse_loss_layer_005": 0.013855, "value_mse_loss_layer_006": 0.016846, "value_mse_loss_layer_007": 0.018921, "value_mse_loss_layer_008": 0.022583, "value_mse_loss_layer_009": 0.029175, "value_mse_loss_layer_010": 0.025757, "value_mse_loss_layer_011": 0.026855, "value_mse_loss_layer_012": 0.039062, "value_mse_loss_layer_013": 0.028931, "value_mse_loss_layer_014": 0.030273, "value_mse_loss_layer_015": 0.032715, "value_mse_loss_layer_016": 0.028809, "value_mse_loss_layer_017": 0.032715, "value_mse_loss_layer_018": 0.040527, "value_mse_loss_layer_019": 0.036377, "value_mse_loss_layer_020": 0.038574, "value_mse_loss_layer_021": 0.052979, "value_mse_loss_layer_022": 0.043213, "value_mse_loss_layer_023": 0.050537, "value_mse_loss_layer_024": 0.054199, "value_mse_loss_layer_025": 0.075684, "value_mse_loss_layer_026": 0.065918, "value_mse_loss_layer_027": 0.07666, "value_mse_loss_layer_028": 0.081055, "value_mse_loss_layer_029": 0.114258, "value_mse_loss_layer_030": 0.097656, "value_mse_loss_layer_031": 0.118652, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 9.4e-05, "vq_loss_layer_005": 9.9e-05, "vq_loss_layer_006": 0.000175, "vq_loss_layer_007": 0.000254, "vq_loss_layer_008": 0.000265, "vq_loss_layer_009": 0.000364, "vq_loss_layer_010": 0.000288, "vq_loss_layer_011": 0.000338, "vq_loss_layer_012": 0.001373, "vq_loss_layer_013": 0.000441, "vq_loss_layer_014": 0.000526, "vq_loss_layer_015": 0.000576, "vq_loss_layer_016": 0.000565, "vq_loss_layer_017": 0.000496, "vq_loss_layer_018": 0.000452, "vq_loss_layer_019": 0.000252, "vq_loss_layer_020": 0.000301, "vq_loss_layer_021": 0.00066, "vq_loss_layer_022": 0.000313, "vq_loss_layer_023": 0.000427, "vq_loss_layer_024": 0.000315, "vq_loss_layer_025": 0.000557, "vq_loss_layer_026": 0.000919, "vq_loss_layer_027": 0.000748, "vq_loss_layer_028": 0.001251, "vq_loss_layer_029": 0.001793, "vq_loss_layer_030": 0.003067, "vq_loss_layer_031": 0.006989 }, { "ce_loss": 2.296658, "epoch": 0.0024, "grad_norm": 0.005092536099255085, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.075684, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.059381, "kv_vq_loss": 0.00078, "learning_rate": 0.0008450528104279014, "loss": 0.060168, "step": 2400, "value_mse_loss_layer_000": 0.00087, "value_mse_loss_layer_001": 0.002441, "value_mse_loss_layer_002": 0.009949, "value_mse_loss_layer_003": 0.015259, "value_mse_loss_layer_004": 0.016479, "value_mse_loss_layer_005": 0.014282, "value_mse_loss_layer_006": 0.016846, "value_mse_loss_layer_007": 0.018433, "value_mse_loss_layer_008": 0.022095, "value_mse_loss_layer_009": 0.030029, "value_mse_loss_layer_010": 0.024658, "value_mse_loss_layer_011": 0.026245, "value_mse_loss_layer_012": 0.027588, "value_mse_loss_layer_013": 0.027954, "value_mse_loss_layer_014": 0.030762, "value_mse_loss_layer_015": 0.032227, "value_mse_loss_layer_016": 0.027832, "value_mse_loss_layer_017": 0.032471, "value_mse_loss_layer_018": 0.030273, "value_mse_loss_layer_019": 0.036865, "value_mse_loss_layer_020": 0.037109, "value_mse_loss_layer_021": 0.046387, "value_mse_loss_layer_022": 0.040771, "value_mse_loss_layer_023": 0.053223, "value_mse_loss_layer_024": 0.05249, "value_mse_loss_layer_025": 0.071777, "value_mse_loss_layer_026": 0.061279, "value_mse_loss_layer_027": 0.069824, "value_mse_loss_layer_028": 0.072266, "value_mse_loss_layer_029": 0.106934, "value_mse_loss_layer_030": 0.090332, "value_mse_loss_layer_031": 0.115234, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 0.000183, "vq_loss_layer_005": 0.000105, "vq_loss_layer_006": 0.000175, "vq_loss_layer_007": 0.000254, "vq_loss_layer_008": 0.000263, "vq_loss_layer_009": 0.000429, "vq_loss_layer_010": 0.000299, "vq_loss_layer_011": 0.000319, "vq_loss_layer_012": 0.000572, "vq_loss_layer_013": 0.000408, "vq_loss_layer_014": 0.000538, "vq_loss_layer_015": 0.000584, "vq_loss_layer_016": 0.000538, "vq_loss_layer_017": 0.000603, "vq_loss_layer_018": 0.000301, "vq_loss_layer_019": 0.000246, "vq_loss_layer_020": 0.00029, "vq_loss_layer_021": 0.00061, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.000481, "vq_loss_layer_024": 0.000317, "vq_loss_layer_025": 0.000471, "vq_loss_layer_026": 0.000813, "vq_loss_layer_027": 0.000702, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001762, "vq_loss_layer_030": 0.002289, "vq_loss_layer_031": 0.006805 }, { "ce_loss": 2.302448, "epoch": 0.00241, "grad_norm": 0.006113487761467695, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.059082, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.043945, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.09375, "key_mse_loss_layer_009": 0.099609, "key_mse_loss_layer_010": 0.11084, "key_mse_loss_layer_011": 0.108398, "key_mse_loss_layer_012": 0.080078, "key_mse_loss_layer_013": 0.139648, "key_mse_loss_layer_014": 0.133789, "key_mse_loss_layer_015": 0.119629, "key_mse_loss_layer_016": 0.115234, "key_mse_loss_layer_017": 0.11084, "key_mse_loss_layer_018": 0.121094, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.104004, "key_mse_loss_layer_021": 0.101074, "key_mse_loss_layer_022": 0.106445, "key_mse_loss_layer_023": 0.103027, "key_mse_loss_layer_024": 0.082031, "key_mse_loss_layer_025": 0.077148, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.095703, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.05943, "kv_vq_loss": 0.000765, "learning_rate": 0.000845504260643717, "loss": 0.060202, "step": 2410, "value_mse_loss_layer_000": 0.000854, "value_mse_loss_layer_001": 0.002411, "value_mse_loss_layer_002": 0.009949, "value_mse_loss_layer_003": 0.016113, "value_mse_loss_layer_004": 0.015625, "value_mse_loss_layer_005": 0.015198, "value_mse_loss_layer_006": 0.0177, "value_mse_loss_layer_007": 0.020386, "value_mse_loss_layer_008": 0.023682, "value_mse_loss_layer_009": 0.029053, "value_mse_loss_layer_010": 0.0271, "value_mse_loss_layer_011": 0.027466, "value_mse_loss_layer_012": 0.02832, "value_mse_loss_layer_013": 0.029785, "value_mse_loss_layer_014": 0.031494, "value_mse_loss_layer_015": 0.030518, "value_mse_loss_layer_016": 0.026123, "value_mse_loss_layer_017": 0.029053, "value_mse_loss_layer_018": 0.030029, "value_mse_loss_layer_019": 0.032959, "value_mse_loss_layer_020": 0.033936, "value_mse_loss_layer_021": 0.039551, "value_mse_loss_layer_022": 0.037842, "value_mse_loss_layer_023": 0.041748, "value_mse_loss_layer_024": 0.046387, "value_mse_loss_layer_025": 0.059814, "value_mse_loss_layer_026": 0.04834, "value_mse_loss_layer_027": 0.100586, "value_mse_loss_layer_028": 0.063965, "value_mse_loss_layer_029": 0.094238, "value_mse_loss_layer_030": 0.094727, "value_mse_loss_layer_031": 0.122559, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 2.4e-05, "vq_loss_layer_002": 3.1e-05, "vq_loss_layer_003": 5.3e-05, "vq_loss_layer_004": 0.00013, "vq_loss_layer_005": 0.000152, "vq_loss_layer_006": 0.000259, "vq_loss_layer_007": 0.000338, "vq_loss_layer_008": 0.000437, "vq_loss_layer_009": 0.000416, "vq_loss_layer_010": 0.000454, "vq_loss_layer_011": 0.000471, "vq_loss_layer_012": 0.000668, "vq_loss_layer_013": 0.000542, "vq_loss_layer_014": 0.000767, "vq_loss_layer_015": 0.000648, "vq_loss_layer_016": 0.000637, "vq_loss_layer_017": 0.000441, "vq_loss_layer_018": 0.000328, "vq_loss_layer_019": 0.000263, "vq_loss_layer_020": 0.000296, "vq_loss_layer_021": 0.000626, "vq_loss_layer_022": 0.000418, "vq_loss_layer_023": 0.00038, "vq_loss_layer_024": 0.000412, "vq_loss_layer_025": 0.000809, "vq_loss_layer_026": 0.000771, "vq_loss_layer_027": 0.002121, "vq_loss_layer_028": 0.00135, "vq_loss_layer_029": 0.002243, "vq_loss_layer_030": 0.003937, "vq_loss_layer_031": 0.01001 }, { "ce_loss": 2.264133, "epoch": 0.00242, "grad_norm": 0.005620853044092655, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.058908, "kv_vq_loss": 0.000773, "learning_rate": 0.0008459538414951076, "loss": 0.05968, "step": 2420, "value_mse_loss_layer_000": 0.000839, "value_mse_loss_layer_001": 0.00238, "value_mse_loss_layer_002": 0.010437, "value_mse_loss_layer_003": 0.016357, "value_mse_loss_layer_004": 0.014099, "value_mse_loss_layer_005": 0.014526, "value_mse_loss_layer_006": 0.01709, "value_mse_loss_layer_007": 0.019043, "value_mse_loss_layer_008": 0.022339, "value_mse_loss_layer_009": 0.028687, "value_mse_loss_layer_010": 0.025391, "value_mse_loss_layer_011": 0.027222, "value_mse_loss_layer_012": 0.028198, "value_mse_loss_layer_013": 0.029419, "value_mse_loss_layer_014": 0.031128, "value_mse_loss_layer_015": 0.033203, "value_mse_loss_layer_016": 0.03125, "value_mse_loss_layer_017": 0.031494, "value_mse_loss_layer_018": 0.028931, "value_mse_loss_layer_019": 0.033447, "value_mse_loss_layer_020": 0.040527, "value_mse_loss_layer_021": 0.042969, "value_mse_loss_layer_022": 0.039307, "value_mse_loss_layer_023": 0.045166, "value_mse_loss_layer_024": 0.04834, "value_mse_loss_layer_025": 0.068848, "value_mse_loss_layer_026": 0.050049, "value_mse_loss_layer_027": 0.074707, "value_mse_loss_layer_028": 0.072754, "value_mse_loss_layer_029": 0.102051, "value_mse_loss_layer_030": 0.090332, "value_mse_loss_layer_031": 0.115723, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 2.4e-05, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 0.00013, "vq_loss_layer_006": 0.000199, "vq_loss_layer_007": 0.000278, "vq_loss_layer_008": 0.000299, "vq_loss_layer_009": 0.000357, "vq_loss_layer_010": 0.00037, "vq_loss_layer_011": 0.000402, "vq_loss_layer_012": 0.000618, "vq_loss_layer_013": 0.000483, "vq_loss_layer_014": 0.000626, "vq_loss_layer_015": 0.000778, "vq_loss_layer_016": 0.000713, "vq_loss_layer_017": 0.000557, "vq_loss_layer_018": 0.000284, "vq_loss_layer_019": 0.000299, "vq_loss_layer_020": 0.000393, "vq_loss_layer_021": 0.000637, "vq_loss_layer_022": 0.00036, "vq_loss_layer_023": 0.000446, "vq_loss_layer_024": 0.000412, "vq_loss_layer_025": 0.000698, "vq_loss_layer_026": 0.000698, "vq_loss_layer_027": 0.001091, "vq_loss_layer_028": 0.001511, "vq_loss_layer_029": 0.002045, "vq_loss_layer_030": 0.003143, "vq_loss_layer_031": 0.008667 }, { "ce_loss": 2.267556, "epoch": 0.00243, "grad_norm": 0.004877913743257523, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.058774, "kv_vq_loss": 0.000754, "learning_rate": 0.0008464015683995779, "loss": 0.059543, "step": 2430, "value_mse_loss_layer_000": 0.000896, "value_mse_loss_layer_001": 0.002518, "value_mse_loss_layer_002": 0.010132, "value_mse_loss_layer_003": 0.015381, "value_mse_loss_layer_004": 0.014893, "value_mse_loss_layer_005": 0.015198, "value_mse_loss_layer_006": 0.016968, "value_mse_loss_layer_007": 0.019775, "value_mse_loss_layer_008": 0.022339, "value_mse_loss_layer_009": 0.029785, "value_mse_loss_layer_010": 0.026489, "value_mse_loss_layer_011": 0.02771, "value_mse_loss_layer_012": 0.029175, "value_mse_loss_layer_013": 0.031494, "value_mse_loss_layer_014": 0.032715, "value_mse_loss_layer_015": 0.033691, "value_mse_loss_layer_016": 0.029907, "value_mse_loss_layer_017": 0.032715, "value_mse_loss_layer_018": 0.029907, "value_mse_loss_layer_019": 0.03418, "value_mse_loss_layer_020": 0.036865, "value_mse_loss_layer_021": 0.045654, "value_mse_loss_layer_022": 0.045898, "value_mse_loss_layer_023": 0.050781, "value_mse_loss_layer_024": 0.051025, "value_mse_loss_layer_025": 0.066406, "value_mse_loss_layer_026": 0.050537, "value_mse_loss_layer_027": 0.068359, "value_mse_loss_layer_028": 0.068848, "value_mse_loss_layer_029": 0.100098, "value_mse_loss_layer_030": 0.088379, "value_mse_loss_layer_031": 0.112305, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 9.8e-05, "vq_loss_layer_005": 0.000135, "vq_loss_layer_006": 0.000178, "vq_loss_layer_007": 0.000298, "vq_loss_layer_008": 0.000265, "vq_loss_layer_009": 0.000353, "vq_loss_layer_010": 0.000372, "vq_loss_layer_011": 0.000397, "vq_loss_layer_012": 0.000664, "vq_loss_layer_013": 0.000523, "vq_loss_layer_014": 0.000679, "vq_loss_layer_015": 0.000626, "vq_loss_layer_016": 0.000614, "vq_loss_layer_017": 0.000534, "vq_loss_layer_018": 0.000294, "vq_loss_layer_019": 0.000237, "vq_loss_layer_020": 0.000336, "vq_loss_layer_021": 0.00071, "vq_loss_layer_022": 0.000469, "vq_loss_layer_023": 0.000542, "vq_loss_layer_024": 0.000443, "vq_loss_layer_025": 0.000534, "vq_loss_layer_026": 0.000713, "vq_loss_layer_027": 0.000973, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.002182, "vq_loss_layer_030": 0.002869, "vq_loss_layer_031": 0.008118 }, { "ce_loss": 2.274156, "epoch": 0.00244, "grad_norm": 0.005898339673876762, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.060303, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.059213, "kv_vq_loss": 0.000749, "learning_rate": 0.0008468474565846822, "loss": 0.059988, "step": 2440, "value_mse_loss_layer_000": 0.000847, "value_mse_loss_layer_001": 0.002457, "value_mse_loss_layer_002": 0.009766, "value_mse_loss_layer_003": 0.015259, "value_mse_loss_layer_004": 0.013489, "value_mse_loss_layer_005": 0.013855, "value_mse_loss_layer_006": 0.016602, "value_mse_loss_layer_007": 0.018677, "value_mse_loss_layer_008": 0.022095, "value_mse_loss_layer_009": 0.028198, "value_mse_loss_layer_010": 0.025391, "value_mse_loss_layer_011": 0.026489, "value_mse_loss_layer_012": 0.030273, "value_mse_loss_layer_013": 0.02832, "value_mse_loss_layer_014": 0.029419, "value_mse_loss_layer_015": 0.031982, "value_mse_loss_layer_016": 0.028076, "value_mse_loss_layer_017": 0.031494, "value_mse_loss_layer_018": 0.030273, "value_mse_loss_layer_019": 0.037109, "value_mse_loss_layer_020": 0.035889, "value_mse_loss_layer_021": 0.04248, "value_mse_loss_layer_022": 0.041016, "value_mse_loss_layer_023": 0.04541, "value_mse_loss_layer_024": 0.051758, "value_mse_loss_layer_025": 0.061279, "value_mse_loss_layer_026": 0.053223, "value_mse_loss_layer_027": 0.068848, "value_mse_loss_layer_028": 0.068359, "value_mse_loss_layer_029": 0.116211, "value_mse_loss_layer_030": 0.094238, "value_mse_loss_layer_031": 0.115234, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 0.000108, "vq_loss_layer_006": 0.00018, "vq_loss_layer_007": 0.000278, "vq_loss_layer_008": 0.000275, "vq_loss_layer_009": 0.000317, "vq_loss_layer_010": 0.000311, "vq_loss_layer_011": 0.00032, "vq_loss_layer_012": 0.000778, "vq_loss_layer_013": 0.000538, "vq_loss_layer_014": 0.000561, "vq_loss_layer_015": 0.000591, "vq_loss_layer_016": 0.000534, "vq_loss_layer_017": 0.000526, "vq_loss_layer_018": 0.000336, "vq_loss_layer_019": 0.000248, "vq_loss_layer_020": 0.000282, "vq_loss_layer_021": 0.000519, "vq_loss_layer_022": 0.000303, "vq_loss_layer_023": 0.000362, "vq_loss_layer_024": 0.000355, "vq_loss_layer_025": 0.000401, "vq_loss_layer_026": 0.000656, "vq_loss_layer_027": 0.000656, "vq_loss_layer_028": 0.000866, "vq_loss_layer_029": 0.001648, "vq_loss_layer_030": 0.002563, "vq_loss_layer_031": 0.006836 }, { "ce_loss": 2.266423, "epoch": 0.00245, "grad_norm": 0.005008513107895851, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.059329, "kv_vq_loss": 0.000777, "learning_rate": 0.0008472915210911329, "loss": 0.06012, "step": 2450, "value_mse_loss_layer_000": 0.000885, "value_mse_loss_layer_001": 0.002441, "value_mse_loss_layer_002": 0.009705, "value_mse_loss_layer_003": 0.015625, "value_mse_loss_layer_004": 0.014099, "value_mse_loss_layer_005": 0.014099, "value_mse_loss_layer_006": 0.016724, "value_mse_loss_layer_007": 0.019287, "value_mse_loss_layer_008": 0.022949, "value_mse_loss_layer_009": 0.029175, "value_mse_loss_layer_010": 0.024048, "value_mse_loss_layer_011": 0.028198, "value_mse_loss_layer_012": 0.026855, "value_mse_loss_layer_013": 0.028564, "value_mse_loss_layer_014": 0.029053, "value_mse_loss_layer_015": 0.031494, "value_mse_loss_layer_016": 0.026978, "value_mse_loss_layer_017": 0.032227, "value_mse_loss_layer_018": 0.036865, "value_mse_loss_layer_019": 0.034424, "value_mse_loss_layer_020": 0.036377, "value_mse_loss_layer_021": 0.049072, "value_mse_loss_layer_022": 0.039795, "value_mse_loss_layer_023": 0.047119, "value_mse_loss_layer_024": 0.05127, "value_mse_loss_layer_025": 0.061523, "value_mse_loss_layer_026": 0.053955, "value_mse_loss_layer_027": 0.063477, "value_mse_loss_layer_028": 0.067383, "value_mse_loss_layer_029": 0.093262, "value_mse_loss_layer_030": 0.092285, "value_mse_loss_layer_031": 0.114746, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 0.000105, "vq_loss_layer_006": 0.000179, "vq_loss_layer_007": 0.000301, "vq_loss_layer_008": 0.000343, "vq_loss_layer_009": 0.000368, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000504, "vq_loss_layer_012": 0.000557, "vq_loss_layer_013": 0.000467, "vq_loss_layer_014": 0.000549, "vq_loss_layer_015": 0.000561, "vq_loss_layer_016": 0.000526, "vq_loss_layer_017": 0.00058, "vq_loss_layer_018": 0.000568, "vq_loss_layer_019": 0.000278, "vq_loss_layer_020": 0.000341, "vq_loss_layer_021": 0.000706, "vq_loss_layer_022": 0.000345, "vq_loss_layer_023": 0.000458, "vq_loss_layer_024": 0.000429, "vq_loss_layer_025": 0.000462, "vq_loss_layer_026": 0.000824, "vq_loss_layer_027": 0.000717, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001709, "vq_loss_layer_030": 0.003128, "vq_loss_layer_031": 0.007599 }, { "ce_loss": 2.315772, "epoch": 0.00246, "grad_norm": 0.00396402133628726, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.089355, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.059814, "kv_mse_loss": 0.059256, "kv_vq_loss": 0.000757, "learning_rate": 0.0008477337767758446, "loss": 0.060025, "step": 2460, "value_mse_loss_layer_000": 0.000854, "value_mse_loss_layer_001": 0.002426, "value_mse_loss_layer_002": 0.009827, "value_mse_loss_layer_003": 0.015747, "value_mse_loss_layer_004": 0.014954, "value_mse_loss_layer_005": 0.014343, "value_mse_loss_layer_006": 0.017334, "value_mse_loss_layer_007": 0.018921, "value_mse_loss_layer_008": 0.023071, "value_mse_loss_layer_009": 0.029053, "value_mse_loss_layer_010": 0.025513, "value_mse_loss_layer_011": 0.026123, "value_mse_loss_layer_012": 0.0271, "value_mse_loss_layer_013": 0.028687, "value_mse_loss_layer_014": 0.031982, "value_mse_loss_layer_015": 0.03418, "value_mse_loss_layer_016": 0.029541, "value_mse_loss_layer_017": 0.031006, "value_mse_loss_layer_018": 0.032227, "value_mse_loss_layer_019": 0.034912, "value_mse_loss_layer_020": 0.037109, "value_mse_loss_layer_021": 0.042725, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.047119, "value_mse_loss_layer_024": 0.05542, "value_mse_loss_layer_025": 0.062256, "value_mse_loss_layer_026": 0.054688, "value_mse_loss_layer_027": 0.074219, "value_mse_loss_layer_028": 0.069824, "value_mse_loss_layer_029": 0.10498, "value_mse_loss_layer_030": 0.09668, "value_mse_loss_layer_031": 0.118164, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 0.000106, "vq_loss_layer_005": 0.000106, "vq_loss_layer_006": 0.000209, "vq_loss_layer_007": 0.000267, "vq_loss_layer_008": 0.00033, "vq_loss_layer_009": 0.00037, "vq_loss_layer_010": 0.000317, "vq_loss_layer_011": 0.00036, "vq_loss_layer_012": 0.000553, "vq_loss_layer_013": 0.000479, "vq_loss_layer_014": 0.000641, "vq_loss_layer_015": 0.000679, "vq_loss_layer_016": 0.000671, "vq_loss_layer_017": 0.000507, "vq_loss_layer_018": 0.000332, "vq_loss_layer_019": 0.000299, "vq_loss_layer_020": 0.000332, "vq_loss_layer_021": 0.000565, "vq_loss_layer_022": 0.000319, "vq_loss_layer_023": 0.000395, "vq_loss_layer_024": 0.000473, "vq_loss_layer_025": 0.000433, "vq_loss_layer_026": 0.000706, "vq_loss_layer_027": 0.000751, "vq_loss_layer_028": 0.001091, "vq_loss_layer_029": 0.001869, "vq_loss_layer_030": 0.002777, "vq_loss_layer_031": 0.008057 }, { "ce_loss": 2.334475, "epoch": 0.00247, "grad_norm": 0.005638073664158583, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.062012, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.058746, "kv_vq_loss": 0.000754, "learning_rate": 0.0008481742383149162, "loss": 0.059518, "step": 2470, "value_mse_loss_layer_000": 0.000881, "value_mse_loss_layer_001": 0.002457, "value_mse_loss_layer_002": 0.009521, "value_mse_loss_layer_003": 0.014954, "value_mse_loss_layer_004": 0.013489, "value_mse_loss_layer_005": 0.014038, "value_mse_loss_layer_006": 0.016357, "value_mse_loss_layer_007": 0.018799, "value_mse_loss_layer_008": 0.021973, "value_mse_loss_layer_009": 0.02771, "value_mse_loss_layer_010": 0.02417, "value_mse_loss_layer_011": 0.025391, "value_mse_loss_layer_012": 0.033936, "value_mse_loss_layer_013": 0.030029, "value_mse_loss_layer_014": 0.029907, "value_mse_loss_layer_015": 0.031738, "value_mse_loss_layer_016": 0.027222, "value_mse_loss_layer_017": 0.030884, "value_mse_loss_layer_018": 0.030884, "value_mse_loss_layer_019": 0.042236, "value_mse_loss_layer_020": 0.036621, "value_mse_loss_layer_021": 0.043945, "value_mse_loss_layer_022": 0.045166, "value_mse_loss_layer_023": 0.04834, "value_mse_loss_layer_024": 0.058838, "value_mse_loss_layer_025": 0.066895, "value_mse_loss_layer_026": 0.053467, "value_mse_loss_layer_027": 0.078125, "value_mse_loss_layer_028": 0.074219, "value_mse_loss_layer_029": 0.116211, "value_mse_loss_layer_030": 0.094727, "value_mse_loss_layer_031": 0.116699, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 8.5e-05, "vq_loss_layer_005": 0.000104, "vq_loss_layer_006": 0.000174, "vq_loss_layer_007": 0.000288, "vq_loss_layer_008": 0.00025, "vq_loss_layer_009": 0.000317, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.000288, "vq_loss_layer_012": 0.001099, "vq_loss_layer_013": 0.000523, "vq_loss_layer_014": 0.000523, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.000519, "vq_loss_layer_017": 0.000452, "vq_loss_layer_018": 0.000284, "vq_loss_layer_019": 0.000305, "vq_loss_layer_020": 0.000244, "vq_loss_layer_021": 0.000492, "vq_loss_layer_022": 0.000351, "vq_loss_layer_023": 0.000322, "vq_loss_layer_024": 0.000401, "vq_loss_layer_025": 0.000364, "vq_loss_layer_026": 0.000572, "vq_loss_layer_027": 0.00079, "vq_loss_layer_028": 0.000927, "vq_loss_layer_029": 0.001526, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.006287 }, { "ce_loss": 2.280397, "epoch": 0.00248, "grad_norm": 0.005819315556436777, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.11084, "key_mse_loss_layer_016": 0.10498, "key_mse_loss_layer_017": 0.105469, "key_mse_loss_layer_018": 0.111816, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.10498, "key_mse_loss_layer_021": 0.098633, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.059396, "kv_vq_loss": 0.000767, "learning_rate": 0.0008486129202065539, "loss": 0.060181, "step": 2480, "value_mse_loss_layer_000": 0.000862, "value_mse_loss_layer_001": 0.002396, "value_mse_loss_layer_002": 0.010254, "value_mse_loss_layer_003": 0.015747, "value_mse_loss_layer_004": 0.017944, "value_mse_loss_layer_005": 0.014038, "value_mse_loss_layer_006": 0.017944, "value_mse_loss_layer_007": 0.018311, "value_mse_loss_layer_008": 0.022217, "value_mse_loss_layer_009": 0.028198, "value_mse_loss_layer_010": 0.02417, "value_mse_loss_layer_011": 0.025146, "value_mse_loss_layer_012": 0.027466, "value_mse_loss_layer_013": 0.028442, "value_mse_loss_layer_014": 0.028687, "value_mse_loss_layer_015": 0.029663, "value_mse_loss_layer_016": 0.025391, "value_mse_loss_layer_017": 0.031006, "value_mse_loss_layer_018": 0.028931, "value_mse_loss_layer_019": 0.034424, "value_mse_loss_layer_020": 0.035645, "value_mse_loss_layer_021": 0.043213, "value_mse_loss_layer_022": 0.039551, "value_mse_loss_layer_023": 0.045898, "value_mse_loss_layer_024": 0.049316, "value_mse_loss_layer_025": 0.068848, "value_mse_loss_layer_026": 0.053467, "value_mse_loss_layer_027": 0.066406, "value_mse_loss_layer_028": 0.066406, "value_mse_loss_layer_029": 0.09375, "value_mse_loss_layer_030": 0.086914, "value_mse_loss_layer_031": 0.118652, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 0.000257, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000257, "vq_loss_layer_007": 0.000261, "vq_loss_layer_008": 0.000315, "vq_loss_layer_009": 0.000393, "vq_loss_layer_010": 0.000309, "vq_loss_layer_011": 0.00032, "vq_loss_layer_012": 0.000656, "vq_loss_layer_013": 0.000473, "vq_loss_layer_014": 0.000599, "vq_loss_layer_015": 0.000515, "vq_loss_layer_016": 0.0005, "vq_loss_layer_017": 0.000546, "vq_loss_layer_018": 0.000275, "vq_loss_layer_019": 0.000237, "vq_loss_layer_020": 0.000292, "vq_loss_layer_021": 0.000652, "vq_loss_layer_022": 0.000345, "vq_loss_layer_023": 0.000469, "vq_loss_layer_024": 0.000389, "vq_loss_layer_025": 0.000645, "vq_loss_layer_026": 0.000843, "vq_loss_layer_027": 0.000687, "vq_loss_layer_028": 0.001099, "vq_loss_layer_029": 0.001465, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.007751 }, { "ce_loss": 2.26286, "epoch": 0.00249, "grad_norm": 0.0052781617268919945, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.094727, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.085938, "key_mse_loss_layer_023": 0.083496, "key_mse_loss_layer_024": 0.065918, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.072754, "key_mse_loss_layer_027": 0.072754, "key_mse_loss_layer_028": 0.07959, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.059134, "kv_vq_loss": 0.000747, "learning_rate": 0.0008490498367739339, "loss": 0.0599, "step": 2490, "value_mse_loss_layer_000": 0.000893, "value_mse_loss_layer_001": 0.002441, "value_mse_loss_layer_002": 0.00946, "value_mse_loss_layer_003": 0.015564, "value_mse_loss_layer_004": 0.014038, "value_mse_loss_layer_005": 0.013428, "value_mse_loss_layer_006": 0.016602, "value_mse_loss_layer_007": 0.018066, "value_mse_loss_layer_008": 0.021484, "value_mse_loss_layer_009": 0.02771, "value_mse_loss_layer_010": 0.023926, "value_mse_loss_layer_011": 0.025879, "value_mse_loss_layer_012": 0.026733, "value_mse_loss_layer_013": 0.027588, "value_mse_loss_layer_014": 0.029053, "value_mse_loss_layer_015": 0.0354, "value_mse_loss_layer_016": 0.027344, "value_mse_loss_layer_017": 0.031738, "value_mse_loss_layer_018": 0.03125, "value_mse_loss_layer_019": 0.036377, "value_mse_loss_layer_020": 0.035889, "value_mse_loss_layer_021": 0.042236, "value_mse_loss_layer_022": 0.040283, "value_mse_loss_layer_023": 0.047363, "value_mse_loss_layer_024": 0.048584, "value_mse_loss_layer_025": 0.069824, "value_mse_loss_layer_026": 0.051025, "value_mse_loss_layer_027": 0.066895, "value_mse_loss_layer_028": 0.066895, "value_mse_loss_layer_029": 0.102051, "value_mse_loss_layer_030": 0.097168, "value_mse_loss_layer_031": 0.109863, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 8.5e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.000174, "vq_loss_layer_007": 0.000252, "vq_loss_layer_008": 0.000241, "vq_loss_layer_009": 0.000296, "vq_loss_layer_010": 0.000269, "vq_loss_layer_011": 0.000338, "vq_loss_layer_012": 0.000546, "vq_loss_layer_013": 0.000423, "vq_loss_layer_014": 0.000519, "vq_loss_layer_015": 0.000755, "vq_loss_layer_016": 0.000534, "vq_loss_layer_017": 0.000488, "vq_loss_layer_018": 0.000301, "vq_loss_layer_019": 0.000246, "vq_loss_layer_020": 0.000278, "vq_loss_layer_021": 0.000561, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.000418, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000481, "vq_loss_layer_026": 0.000645, "vq_loss_layer_027": 0.000671, "vq_loss_layer_028": 0.000961, "vq_loss_layer_029": 0.001602, "vq_loss_layer_030": 0.00296, "vq_loss_layer_031": 0.006836 }, { "ce_loss": 2.304923, "epoch": 0.0025, "grad_norm": 0.004868125077337027, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.060791, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.120117, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.058801, "kv_vq_loss": 0.000764, "learning_rate": 0.0008494850021680092, "loss": 0.059573, "step": 2500, "value_mse_loss_layer_000": 0.000862, "value_mse_loss_layer_001": 0.002426, "value_mse_loss_layer_002": 0.010071, "value_mse_loss_layer_003": 0.015991, "value_mse_loss_layer_004": 0.01532, "value_mse_loss_layer_005": 0.016479, "value_mse_loss_layer_006": 0.016846, "value_mse_loss_layer_007": 0.019653, "value_mse_loss_layer_008": 0.023071, "value_mse_loss_layer_009": 0.029663, "value_mse_loss_layer_010": 0.028564, "value_mse_loss_layer_011": 0.027466, "value_mse_loss_layer_012": 0.029053, "value_mse_loss_layer_013": 0.03064, "value_mse_loss_layer_014": 0.033691, "value_mse_loss_layer_015": 0.032959, "value_mse_loss_layer_016": 0.0271, "value_mse_loss_layer_017": 0.031738, "value_mse_loss_layer_018": 0.03418, "value_mse_loss_layer_019": 0.034424, "value_mse_loss_layer_020": 0.037354, "value_mse_loss_layer_021": 0.043945, "value_mse_loss_layer_022": 0.041748, "value_mse_loss_layer_023": 0.047852, "value_mse_loss_layer_024": 0.050293, "value_mse_loss_layer_025": 0.06543, "value_mse_loss_layer_026": 0.057861, "value_mse_loss_layer_027": 0.080078, "value_mse_loss_layer_028": 0.073242, "value_mse_loss_layer_029": 0.106445, "value_mse_loss_layer_030": 0.094727, "value_mse_loss_layer_031": 0.124512, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 2.8e-05, "vq_loss_layer_002": 3.3e-05, "vq_loss_layer_003": 5.5e-05, "vq_loss_layer_004": 0.000102, "vq_loss_layer_005": 0.000186, "vq_loss_layer_006": 0.000188, "vq_loss_layer_007": 0.000298, "vq_loss_layer_008": 0.00038, "vq_loss_layer_009": 0.000423, "vq_loss_layer_010": 0.000538, "vq_loss_layer_011": 0.000406, "vq_loss_layer_012": 0.000648, "vq_loss_layer_013": 0.000591, "vq_loss_layer_014": 0.000881, "vq_loss_layer_015": 0.000916, "vq_loss_layer_016": 0.000645, "vq_loss_layer_017": 0.000645, "vq_loss_layer_018": 0.000504, "vq_loss_layer_019": 0.000278, "vq_loss_layer_020": 0.000399, "vq_loss_layer_021": 0.000786, "vq_loss_layer_022": 0.000511, "vq_loss_layer_023": 0.000591, "vq_loss_layer_024": 0.000504, "vq_loss_layer_025": 0.000755, "vq_loss_layer_026": 0.00095, "vq_loss_layer_027": 0.001183, "vq_loss_layer_028": 0.001595, "vq_loss_layer_029": 0.003021, "vq_loss_layer_030": 0.00386, "vq_loss_layer_031": 0.010559 }, { "ce_loss": 2.297121, "epoch": 0.00251, "grad_norm": 0.006557197775691748, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.058942, "kv_vq_loss": 0.000778, "learning_rate": 0.0008499184303702594, "loss": 0.059732, "step": 2510, "value_mse_loss_layer_000": 0.000854, "value_mse_loss_layer_001": 0.002426, "value_mse_loss_layer_002": 0.012085, "value_mse_loss_layer_003": 0.015991, "value_mse_loss_layer_004": 0.013977, "value_mse_loss_layer_005": 0.014099, "value_mse_loss_layer_006": 0.017334, "value_mse_loss_layer_007": 0.018799, "value_mse_loss_layer_008": 0.022095, "value_mse_loss_layer_009": 0.02832, "value_mse_loss_layer_010": 0.025146, "value_mse_loss_layer_011": 0.026733, "value_mse_loss_layer_012": 0.028198, "value_mse_loss_layer_013": 0.028809, "value_mse_loss_layer_014": 0.029297, "value_mse_loss_layer_015": 0.032227, "value_mse_loss_layer_016": 0.032715, "value_mse_loss_layer_017": 0.033691, "value_mse_loss_layer_018": 0.030396, "value_mse_loss_layer_019": 0.034424, "value_mse_loss_layer_020": 0.03833, "value_mse_loss_layer_021": 0.047119, "value_mse_loss_layer_022": 0.041504, "value_mse_loss_layer_023": 0.047852, "value_mse_loss_layer_024": 0.05542, "value_mse_loss_layer_025": 0.066406, "value_mse_loss_layer_026": 0.05249, "value_mse_loss_layer_027": 0.075195, "value_mse_loss_layer_028": 0.088379, "value_mse_loss_layer_029": 0.120117, "value_mse_loss_layer_030": 0.092773, "value_mse_loss_layer_031": 0.114746, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000207, "vq_loss_layer_007": 0.000273, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.00033, "vq_loss_layer_010": 0.00033, "vq_loss_layer_011": 0.000368, "vq_loss_layer_012": 0.000618, "vq_loss_layer_013": 0.000456, "vq_loss_layer_014": 0.000496, "vq_loss_layer_015": 0.000549, "vq_loss_layer_016": 0.000687, "vq_loss_layer_017": 0.000557, "vq_loss_layer_018": 0.000301, "vq_loss_layer_019": 0.00022, "vq_loss_layer_020": 0.000299, "vq_loss_layer_021": 0.000584, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.00037, "vq_loss_layer_024": 0.000402, "vq_loss_layer_025": 0.000429, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.00082, "vq_loss_layer_028": 0.001358, "vq_loss_layer_029": 0.001862, "vq_loss_layer_030": 0.002686, "vq_loss_layer_031": 0.006683 }, { "ce_loss": 2.27501, "epoch": 0.00252, "grad_norm": 0.0051150210201740265, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.072754, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.059064, "kv_vq_loss": 0.000759, "learning_rate": 0.000850350135195386, "loss": 0.059833, "step": 2520, "value_mse_loss_layer_000": 0.000885, "value_mse_loss_layer_001": 0.002426, "value_mse_loss_layer_002": 0.010498, "value_mse_loss_layer_003": 0.015991, "value_mse_loss_layer_004": 0.014893, "value_mse_loss_layer_005": 0.013672, "value_mse_loss_layer_006": 0.0177, "value_mse_loss_layer_007": 0.018921, "value_mse_loss_layer_008": 0.022095, "value_mse_loss_layer_009": 0.029297, "value_mse_loss_layer_010": 0.025513, "value_mse_loss_layer_011": 0.027466, "value_mse_loss_layer_012": 0.029175, "value_mse_loss_layer_013": 0.029907, "value_mse_loss_layer_014": 0.033447, "value_mse_loss_layer_015": 0.033447, "value_mse_loss_layer_016": 0.028809, "value_mse_loss_layer_017": 0.032959, "value_mse_loss_layer_018": 0.029785, "value_mse_loss_layer_019": 0.041016, "value_mse_loss_layer_020": 0.037109, "value_mse_loss_layer_021": 0.044922, "value_mse_loss_layer_022": 0.041016, "value_mse_loss_layer_023": 0.04834, "value_mse_loss_layer_024": 0.05127, "value_mse_loss_layer_025": 0.062988, "value_mse_loss_layer_026": 0.049072, "value_mse_loss_layer_027": 0.064941, "value_mse_loss_layer_028": 0.068359, "value_mse_loss_layer_029": 0.101074, "value_mse_loss_layer_030": 0.108887, "value_mse_loss_layer_031": 0.113281, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 0.000118, "vq_loss_layer_005": 0.0001, "vq_loss_layer_006": 0.000225, "vq_loss_layer_007": 0.000284, "vq_loss_layer_008": 0.000256, "vq_loss_layer_009": 0.00033, "vq_loss_layer_010": 0.000313, "vq_loss_layer_011": 0.000362, "vq_loss_layer_012": 0.000622, "vq_loss_layer_013": 0.000484, "vq_loss_layer_014": 0.000648, "vq_loss_layer_015": 0.000591, "vq_loss_layer_016": 0.000565, "vq_loss_layer_017": 0.000515, "vq_loss_layer_018": 0.000315, "vq_loss_layer_019": 0.000324, "vq_loss_layer_020": 0.000336, "vq_loss_layer_021": 0.000641, "vq_loss_layer_022": 0.000376, "vq_loss_layer_023": 0.000546, "vq_loss_layer_024": 0.000496, "vq_loss_layer_025": 0.00053, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.000717, "vq_loss_layer_028": 0.00103, "vq_loss_layer_029": 0.001877, "vq_loss_layer_030": 0.002731, "vq_loss_layer_031": 0.007507 }, { "ce_loss": 2.237682, "epoch": 0.00253, "grad_norm": 0.003896525362506509, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.059085, "kv_vq_loss": 0.00074, "learning_rate": 0.0008507801302939544, "loss": 0.059854, "step": 2530, "value_mse_loss_layer_000": 0.00087, "value_mse_loss_layer_001": 0.002441, "value_mse_loss_layer_002": 0.009766, "value_mse_loss_layer_003": 0.016357, "value_mse_loss_layer_004": 0.01416, "value_mse_loss_layer_005": 0.015198, "value_mse_loss_layer_006": 0.016479, "value_mse_loss_layer_007": 0.018311, "value_mse_loss_layer_008": 0.021973, "value_mse_loss_layer_009": 0.029053, "value_mse_loss_layer_010": 0.024902, "value_mse_loss_layer_011": 0.026367, "value_mse_loss_layer_012": 0.026733, "value_mse_loss_layer_013": 0.027588, "value_mse_loss_layer_014": 0.029785, "value_mse_loss_layer_015": 0.031738, "value_mse_loss_layer_016": 0.027222, "value_mse_loss_layer_017": 0.03064, "value_mse_loss_layer_018": 0.029053, "value_mse_loss_layer_019": 0.033936, "value_mse_loss_layer_020": 0.037354, "value_mse_loss_layer_021": 0.04126, "value_mse_loss_layer_022": 0.046143, "value_mse_loss_layer_023": 0.047363, "value_mse_loss_layer_024": 0.05127, "value_mse_loss_layer_025": 0.062256, "value_mse_loss_layer_026": 0.055176, "value_mse_loss_layer_027": 0.070312, "value_mse_loss_layer_028": 0.069824, "value_mse_loss_layer_029": 0.102051, "value_mse_loss_layer_030": 0.094727, "value_mse_loss_layer_031": 0.117676, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 9e-05, "vq_loss_layer_005": 0.000133, "vq_loss_layer_006": 0.00018, "vq_loss_layer_007": 0.000269, "vq_loss_layer_008": 0.000301, "vq_loss_layer_009": 0.000422, "vq_loss_layer_010": 0.000336, "vq_loss_layer_011": 0.000389, "vq_loss_layer_012": 0.000549, "vq_loss_layer_013": 0.000463, "vq_loss_layer_014": 0.000591, "vq_loss_layer_015": 0.000629, "vq_loss_layer_016": 0.000572, "vq_loss_layer_017": 0.000471, "vq_loss_layer_018": 0.000301, "vq_loss_layer_019": 0.000257, "vq_loss_layer_020": 0.000298, "vq_loss_layer_021": 0.000542, "vq_loss_layer_022": 0.000515, "vq_loss_layer_023": 0.000414, "vq_loss_layer_024": 0.000443, "vq_loss_layer_025": 0.00053, "vq_loss_layer_026": 0.000805, "vq_loss_layer_027": 0.00079, "vq_loss_layer_028": 0.001091, "vq_loss_layer_029": 0.001953, "vq_loss_layer_030": 0.002838, "vq_loss_layer_031": 0.008057 }, { "ce_loss": 2.291317, "epoch": 0.00254, "grad_norm": 0.004637246951460838, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.058679, "kv_vq_loss": 0.000731, "learning_rate": 0.0008512084291549844, "loss": 0.05943, "step": 2540, "value_mse_loss_layer_000": 0.000877, "value_mse_loss_layer_001": 0.002487, "value_mse_loss_layer_002": 0.009583, "value_mse_loss_layer_003": 0.014709, "value_mse_loss_layer_004": 0.013794, "value_mse_loss_layer_005": 0.013733, "value_mse_loss_layer_006": 0.018555, "value_mse_loss_layer_007": 0.018677, "value_mse_loss_layer_008": 0.021362, "value_mse_loss_layer_009": 0.029053, "value_mse_loss_layer_010": 0.025269, "value_mse_loss_layer_011": 0.026733, "value_mse_loss_layer_012": 0.027588, "value_mse_loss_layer_013": 0.028564, "value_mse_loss_layer_014": 0.029053, "value_mse_loss_layer_015": 0.031494, "value_mse_loss_layer_016": 0.025879, "value_mse_loss_layer_017": 0.032715, "value_mse_loss_layer_018": 0.028198, "value_mse_loss_layer_019": 0.032959, "value_mse_loss_layer_020": 0.0354, "value_mse_loss_layer_021": 0.042969, "value_mse_loss_layer_022": 0.039795, "value_mse_loss_layer_023": 0.042236, "value_mse_loss_layer_024": 0.043457, "value_mse_loss_layer_025": 0.062012, "value_mse_loss_layer_026": 0.045654, "value_mse_loss_layer_027": 0.059814, "value_mse_loss_layer_028": 0.061523, "value_mse_loss_layer_029": 0.091797, "value_mse_loss_layer_030": 0.084961, "value_mse_loss_layer_031": 0.114258, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 0.000102, "vq_loss_layer_005": 0.000114, "vq_loss_layer_006": 0.000286, "vq_loss_layer_007": 0.000282, "vq_loss_layer_008": 0.000282, "vq_loss_layer_009": 0.000381, "vq_loss_layer_010": 0.000353, "vq_loss_layer_011": 0.000381, "vq_loss_layer_012": 0.000675, "vq_loss_layer_013": 0.000483, "vq_loss_layer_014": 0.000584, "vq_loss_layer_015": 0.000637, "vq_loss_layer_016": 0.000549, "vq_loss_layer_017": 0.000652, "vq_loss_layer_018": 0.000294, "vq_loss_layer_019": 0.000275, "vq_loss_layer_020": 0.000391, "vq_loss_layer_021": 0.000694, "vq_loss_layer_022": 0.000479, "vq_loss_layer_023": 0.000473, "vq_loss_layer_024": 0.000393, "vq_loss_layer_025": 0.000511, "vq_loss_layer_026": 0.000652, "vq_loss_layer_027": 0.000668, "vq_loss_layer_028": 0.000893, "vq_loss_layer_029": 0.001457, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.00769 }, { "ce_loss": 2.308062, "epoch": 0.00255, "grad_norm": 0.005123494658619165, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.061279, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.059189, "kv_vq_loss": 0.000775, "learning_rate": 0.0008516350451084887, "loss": 0.059967, "step": 2550, "value_mse_loss_layer_000": 0.000839, "value_mse_loss_layer_001": 0.002411, "value_mse_loss_layer_002": 0.010864, "value_mse_loss_layer_003": 0.014709, "value_mse_loss_layer_004": 0.013123, "value_mse_loss_layer_005": 0.013855, "value_mse_loss_layer_006": 0.0177, "value_mse_loss_layer_007": 0.018188, "value_mse_loss_layer_008": 0.022095, "value_mse_loss_layer_009": 0.027832, "value_mse_loss_layer_010": 0.02832, "value_mse_loss_layer_011": 0.026001, "value_mse_loss_layer_012": 0.028076, "value_mse_loss_layer_013": 0.02771, "value_mse_loss_layer_014": 0.03064, "value_mse_loss_layer_015": 0.032471, "value_mse_loss_layer_016": 0.028687, "value_mse_loss_layer_017": 0.031128, "value_mse_loss_layer_018": 0.028931, "value_mse_loss_layer_019": 0.034424, "value_mse_loss_layer_020": 0.035645, "value_mse_loss_layer_021": 0.043945, "value_mse_loss_layer_022": 0.040283, "value_mse_loss_layer_023": 0.050537, "value_mse_loss_layer_024": 0.048828, "value_mse_loss_layer_025": 0.064941, "value_mse_loss_layer_026": 0.062256, "value_mse_loss_layer_027": 0.066406, "value_mse_loss_layer_028": 0.071289, "value_mse_loss_layer_029": 0.100586, "value_mse_loss_layer_030": 0.093262, "value_mse_loss_layer_031": 0.111816, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 0.000108, "vq_loss_layer_006": 0.000231, "vq_loss_layer_007": 0.00028, "vq_loss_layer_008": 0.000282, "vq_loss_layer_009": 0.000324, "vq_loss_layer_010": 0.000359, "vq_loss_layer_011": 0.000319, "vq_loss_layer_012": 0.000629, "vq_loss_layer_013": 0.000429, "vq_loss_layer_014": 0.000622, "vq_loss_layer_015": 0.000626, "vq_loss_layer_016": 0.00058, "vq_loss_layer_017": 0.000484, "vq_loss_layer_018": 0.000273, "vq_loss_layer_019": 0.000221, "vq_loss_layer_020": 0.00028, "vq_loss_layer_021": 0.000519, "vq_loss_layer_022": 0.000313, "vq_loss_layer_023": 0.000467, "vq_loss_layer_024": 0.000311, "vq_loss_layer_025": 0.000406, "vq_loss_layer_026": 0.000896, "vq_loss_layer_027": 0.000622, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.001373, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.006073 }, { "ce_loss": 2.248108, "epoch": 0.00256, "grad_norm": 0.004625450354069471, "key_mse_loss_layer_000": 0.003601, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.061523, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.100098, "key_mse_loss_layer_024": 0.083008, "key_mse_loss_layer_025": 0.07959, "key_mse_loss_layer_026": 0.091797, "key_mse_loss_layer_027": 0.094727, "key_mse_loss_layer_028": 0.098633, "key_mse_loss_layer_029": 0.094238, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.059113, "kv_vq_loss": 0.000758, "learning_rate": 0.0008520599913279623, "loss": 0.059888, "step": 2560, "value_mse_loss_layer_000": 0.000854, "value_mse_loss_layer_001": 0.002441, "value_mse_loss_layer_002": 0.010315, "value_mse_loss_layer_003": 0.018188, "value_mse_loss_layer_004": 0.017578, "value_mse_loss_layer_005": 0.014893, "value_mse_loss_layer_006": 0.017334, "value_mse_loss_layer_007": 0.019531, "value_mse_loss_layer_008": 0.023071, "value_mse_loss_layer_009": 0.027832, "value_mse_loss_layer_010": 0.024414, "value_mse_loss_layer_011": 0.026123, "value_mse_loss_layer_012": 0.026733, "value_mse_loss_layer_013": 0.027954, "value_mse_loss_layer_014": 0.028931, "value_mse_loss_layer_015": 0.029785, "value_mse_loss_layer_016": 0.028442, "value_mse_loss_layer_017": 0.03064, "value_mse_loss_layer_018": 0.037598, "value_mse_loss_layer_019": 0.034668, "value_mse_loss_layer_020": 0.038574, "value_mse_loss_layer_021": 0.044678, "value_mse_loss_layer_022": 0.042725, "value_mse_loss_layer_023": 0.050293, "value_mse_loss_layer_024": 0.058105, "value_mse_loss_layer_025": 0.072754, "value_mse_loss_layer_026": 0.068359, "value_mse_loss_layer_027": 0.082031, "value_mse_loss_layer_028": 0.082031, "value_mse_loss_layer_029": 0.125977, "value_mse_loss_layer_030": 0.11084, "value_mse_loss_layer_031": 0.133789, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 2.5e-05, "vq_loss_layer_002": 2.7e-05, "vq_loss_layer_003": 6.3e-05, "vq_loss_layer_004": 0.000176, "vq_loss_layer_005": 0.000119, "vq_loss_layer_006": 0.000221, "vq_loss_layer_007": 0.000296, "vq_loss_layer_008": 0.000368, "vq_loss_layer_009": 0.000383, "vq_loss_layer_010": 0.000368, "vq_loss_layer_011": 0.000425, "vq_loss_layer_012": 0.00058, "vq_loss_layer_013": 0.000511, "vq_loss_layer_014": 0.000584, "vq_loss_layer_015": 0.000568, "vq_loss_layer_016": 0.000641, "vq_loss_layer_017": 0.000542, "vq_loss_layer_018": 0.000706, "vq_loss_layer_019": 0.000336, "vq_loss_layer_020": 0.000444, "vq_loss_layer_021": 0.000523, "vq_loss_layer_022": 0.000397, "vq_loss_layer_023": 0.000381, "vq_loss_layer_024": 0.000675, "vq_loss_layer_025": 0.000664, "vq_loss_layer_026": 0.001305, "vq_loss_layer_027": 0.001274, "vq_loss_layer_028": 0.001686, "vq_loss_layer_029": 0.003235, "vq_loss_layer_030": 0.003937, "vq_loss_layer_031": 0.010376 }, { "ce_loss": 2.277342, "epoch": 0.00257, "grad_norm": 0.005399330519139767, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.058838, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.080566, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.104492, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.09375, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.05892, "kv_vq_loss": 0.000771, "learning_rate": 0.0008524832808328235, "loss": 0.059702, "step": 2570, "value_mse_loss_layer_000": 0.000843, "value_mse_loss_layer_001": 0.002441, "value_mse_loss_layer_002": 0.009705, "value_mse_loss_layer_003": 0.015991, "value_mse_loss_layer_004": 0.014282, "value_mse_loss_layer_005": 0.014343, "value_mse_loss_layer_006": 0.016602, "value_mse_loss_layer_007": 0.019165, "value_mse_loss_layer_008": 0.022095, "value_mse_loss_layer_009": 0.029175, "value_mse_loss_layer_010": 0.025269, "value_mse_loss_layer_011": 0.027344, "value_mse_loss_layer_012": 0.031494, "value_mse_loss_layer_013": 0.028564, "value_mse_loss_layer_014": 0.030029, "value_mse_loss_layer_015": 0.032471, "value_mse_loss_layer_016": 0.02771, "value_mse_loss_layer_017": 0.029907, "value_mse_loss_layer_018": 0.03064, "value_mse_loss_layer_019": 0.035156, "value_mse_loss_layer_020": 0.035889, "value_mse_loss_layer_021": 0.041992, "value_mse_loss_layer_022": 0.041016, "value_mse_loss_layer_023": 0.060303, "value_mse_loss_layer_024": 0.057617, "value_mse_loss_layer_025": 0.072754, "value_mse_loss_layer_026": 0.052002, "value_mse_loss_layer_027": 0.070312, "value_mse_loss_layer_028": 0.069336, "value_mse_loss_layer_029": 0.100098, "value_mse_loss_layer_030": 0.096191, "value_mse_loss_layer_031": 0.118164, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 9.8e-05, "vq_loss_layer_005": 0.00013, "vq_loss_layer_006": 0.00017, "vq_loss_layer_007": 0.000292, "vq_loss_layer_008": 0.000284, "vq_loss_layer_009": 0.000381, "vq_loss_layer_010": 0.00032, "vq_loss_layer_011": 0.000359, "vq_loss_layer_012": 0.000824, "vq_loss_layer_013": 0.000465, "vq_loss_layer_014": 0.000599, "vq_loss_layer_015": 0.000668, "vq_loss_layer_016": 0.000587, "vq_loss_layer_017": 0.000454, "vq_loss_layer_018": 0.000315, "vq_loss_layer_019": 0.000278, "vq_loss_layer_020": 0.000345, "vq_loss_layer_021": 0.000523, "vq_loss_layer_022": 0.00036, "vq_loss_layer_023": 0.000607, "vq_loss_layer_024": 0.000463, "vq_loss_layer_025": 0.000618, "vq_loss_layer_026": 0.000664, "vq_loss_layer_027": 0.000767, "vq_loss_layer_028": 0.001022, "vq_loss_layer_029": 0.001831, "vq_loss_layer_030": 0.002869, "vq_loss_layer_031": 0.007233 }, { "ce_loss": 2.275428, "epoch": 0.00258, "grad_norm": 0.004926643334329128, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.058914, "kv_vq_loss": 0.000753, "learning_rate": 0.0008529049264908074, "loss": 0.059683, "step": 2580, "value_mse_loss_layer_000": 0.000828, "value_mse_loss_layer_001": 0.002411, "value_mse_loss_layer_002": 0.01001, "value_mse_loss_layer_003": 0.015564, "value_mse_loss_layer_004": 0.014771, "value_mse_loss_layer_005": 0.014404, "value_mse_loss_layer_006": 0.017334, "value_mse_loss_layer_007": 0.019897, "value_mse_loss_layer_008": 0.021851, "value_mse_loss_layer_009": 0.027954, "value_mse_loss_layer_010": 0.025391, "value_mse_loss_layer_011": 0.026978, "value_mse_loss_layer_012": 0.027832, "value_mse_loss_layer_013": 0.026855, "value_mse_loss_layer_014": 0.028931, "value_mse_loss_layer_015": 0.030029, "value_mse_loss_layer_016": 0.028931, "value_mse_loss_layer_017": 0.031738, "value_mse_loss_layer_018": 0.029297, "value_mse_loss_layer_019": 0.033691, "value_mse_loss_layer_020": 0.035156, "value_mse_loss_layer_021": 0.044434, "value_mse_loss_layer_022": 0.043945, "value_mse_loss_layer_023": 0.052734, "value_mse_loss_layer_024": 0.053711, "value_mse_loss_layer_025": 0.072754, "value_mse_loss_layer_026": 0.056152, "value_mse_loss_layer_027": 0.074219, "value_mse_loss_layer_028": 0.074707, "value_mse_loss_layer_029": 0.10498, "value_mse_loss_layer_030": 0.105469, "value_mse_loss_layer_031": 0.116211, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 0.000101, "vq_loss_layer_005": 0.000128, "vq_loss_layer_006": 0.000235, "vq_loss_layer_007": 0.000322, "vq_loss_layer_008": 0.000319, "vq_loss_layer_009": 0.000372, "vq_loss_layer_010": 0.00041, "vq_loss_layer_011": 0.000463, "vq_loss_layer_012": 0.000679, "vq_loss_layer_013": 0.000454, "vq_loss_layer_014": 0.000595, "vq_loss_layer_015": 0.000641, "vq_loss_layer_016": 0.000694, "vq_loss_layer_017": 0.000664, "vq_loss_layer_018": 0.000374, "vq_loss_layer_019": 0.000305, "vq_loss_layer_020": 0.00028, "vq_loss_layer_021": 0.000618, "vq_loss_layer_022": 0.000549, "vq_loss_layer_023": 0.000492, "vq_loss_layer_024": 0.000519, "vq_loss_layer_025": 0.000801, "vq_loss_layer_026": 0.000931, "vq_loss_layer_027": 0.001053, "vq_loss_layer_028": 0.001564, "vq_loss_layer_029": 0.002457, "vq_loss_layer_030": 0.004242, "vq_loss_layer_031": 0.008667 }, { "ce_loss": 2.29682, "epoch": 0.00259, "grad_norm": 0.0047110156156122684, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.058731, "kv_vq_loss": 0.000752, "learning_rate": 0.0008533249410203128, "loss": 0.059494, "step": 2590, "value_mse_loss_layer_000": 0.000881, "value_mse_loss_layer_001": 0.002426, "value_mse_loss_layer_002": 0.009888, "value_mse_loss_layer_003": 0.015625, "value_mse_loss_layer_004": 0.015503, "value_mse_loss_layer_005": 0.014587, "value_mse_loss_layer_006": 0.016968, "value_mse_loss_layer_007": 0.018555, "value_mse_loss_layer_008": 0.022461, "value_mse_loss_layer_009": 0.030396, "value_mse_loss_layer_010": 0.025146, "value_mse_loss_layer_011": 0.026489, "value_mse_loss_layer_012": 0.0271, "value_mse_loss_layer_013": 0.030518, "value_mse_loss_layer_014": 0.031494, "value_mse_loss_layer_015": 0.032959, "value_mse_loss_layer_016": 0.030396, "value_mse_loss_layer_017": 0.032471, "value_mse_loss_layer_018": 0.03064, "value_mse_loss_layer_019": 0.035156, "value_mse_loss_layer_020": 0.035889, "value_mse_loss_layer_021": 0.046875, "value_mse_loss_layer_022": 0.040771, "value_mse_loss_layer_023": 0.048096, "value_mse_loss_layer_024": 0.055664, "value_mse_loss_layer_025": 0.066895, "value_mse_loss_layer_026": 0.054932, "value_mse_loss_layer_027": 0.076172, "value_mse_loss_layer_028": 0.073242, "value_mse_loss_layer_029": 0.117676, "value_mse_loss_layer_030": 0.092285, "value_mse_loss_layer_031": 0.119629, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 0.000125, "vq_loss_layer_005": 0.000114, "vq_loss_layer_006": 0.00019, "vq_loss_layer_007": 0.000273, "vq_loss_layer_008": 0.000292, "vq_loss_layer_009": 0.000444, "vq_loss_layer_010": 0.000328, "vq_loss_layer_011": 0.000353, "vq_loss_layer_012": 0.000553, "vq_loss_layer_013": 0.000561, "vq_loss_layer_014": 0.000553, "vq_loss_layer_015": 0.00061, "vq_loss_layer_016": 0.000702, "vq_loss_layer_017": 0.000479, "vq_loss_layer_018": 0.000275, "vq_loss_layer_019": 0.000256, "vq_loss_layer_020": 0.000257, "vq_loss_layer_021": 0.000648, "vq_loss_layer_022": 0.000319, "vq_loss_layer_023": 0.000406, "vq_loss_layer_024": 0.000444, "vq_loss_layer_025": 0.000465, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.000805, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.001831, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.007416 }, { "ce_loss": 2.289786, "epoch": 0.0026, "grad_norm": 0.004716852214187384, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.082031, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.05957, "kv_mse_loss": 0.058798, "kv_vq_loss": 0.000791, "learning_rate": 0.0008537433369927044, "loss": 0.059586, "step": 2600, "value_mse_loss_layer_000": 0.000843, "value_mse_loss_layer_001": 0.002441, "value_mse_loss_layer_002": 0.009583, "value_mse_loss_layer_003": 0.015503, "value_mse_loss_layer_004": 0.015015, "value_mse_loss_layer_005": 0.014526, "value_mse_loss_layer_006": 0.017334, "value_mse_loss_layer_007": 0.018433, "value_mse_loss_layer_008": 0.022217, "value_mse_loss_layer_009": 0.02832, "value_mse_loss_layer_010": 0.023804, "value_mse_loss_layer_011": 0.025879, "value_mse_loss_layer_012": 0.026978, "value_mse_loss_layer_013": 0.028442, "value_mse_loss_layer_014": 0.030396, "value_mse_loss_layer_015": 0.032471, "value_mse_loss_layer_016": 0.027588, "value_mse_loss_layer_017": 0.03125, "value_mse_loss_layer_018": 0.028931, "value_mse_loss_layer_019": 0.035645, "value_mse_loss_layer_020": 0.036133, "value_mse_loss_layer_021": 0.045166, "value_mse_loss_layer_022": 0.039307, "value_mse_loss_layer_023": 0.046875, "value_mse_loss_layer_024": 0.051758, "value_mse_loss_layer_025": 0.062256, "value_mse_loss_layer_026": 0.050537, "value_mse_loss_layer_027": 0.068359, "value_mse_loss_layer_028": 0.074707, "value_mse_loss_layer_029": 0.100586, "value_mse_loss_layer_030": 0.099121, "value_mse_loss_layer_031": 0.117676, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 9.9e-05, "vq_loss_layer_005": 0.000113, "vq_loss_layer_006": 0.000226, "vq_loss_layer_007": 0.000236, "vq_loss_layer_008": 0.000277, "vq_loss_layer_009": 0.000324, "vq_loss_layer_010": 0.000303, "vq_loss_layer_011": 0.000359, "vq_loss_layer_012": 0.000595, "vq_loss_layer_013": 0.000433, "vq_loss_layer_014": 0.000599, "vq_loss_layer_015": 0.000603, "vq_loss_layer_016": 0.000565, "vq_loss_layer_017": 0.000492, "vq_loss_layer_018": 0.000288, "vq_loss_layer_019": 0.000254, "vq_loss_layer_020": 0.000275, "vq_loss_layer_021": 0.000687, "vq_loss_layer_022": 0.000319, "vq_loss_layer_023": 0.000443, "vq_loss_layer_024": 0.000444, "vq_loss_layer_025": 0.000515, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000786, "vq_loss_layer_028": 0.001335, "vq_loss_layer_029": 0.001793, "vq_loss_layer_030": 0.003174, "vq_loss_layer_031": 0.008667 }, { "ce_loss": 2.326334, "epoch": 0.00261, "grad_norm": 0.00412340322509408, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.058511, "kv_vq_loss": 0.000741, "learning_rate": 0.0008541601268345702, "loss": 0.059271, "step": 2610, "value_mse_loss_layer_000": 0.000824, "value_mse_loss_layer_001": 0.00238, "value_mse_loss_layer_002": 0.009521, "value_mse_loss_layer_003": 0.014404, "value_mse_loss_layer_004": 0.013123, "value_mse_loss_layer_005": 0.013306, "value_mse_loss_layer_006": 0.015991, "value_mse_loss_layer_007": 0.019409, "value_mse_loss_layer_008": 0.021606, "value_mse_loss_layer_009": 0.031128, "value_mse_loss_layer_010": 0.024902, "value_mse_loss_layer_011": 0.026611, "value_mse_loss_layer_012": 0.027222, "value_mse_loss_layer_013": 0.02771, "value_mse_loss_layer_014": 0.029297, "value_mse_loss_layer_015": 0.031738, "value_mse_loss_layer_016": 0.026733, "value_mse_loss_layer_017": 0.031982, "value_mse_loss_layer_018": 0.031738, "value_mse_loss_layer_019": 0.033936, "value_mse_loss_layer_020": 0.037598, "value_mse_loss_layer_021": 0.047607, "value_mse_loss_layer_022": 0.040039, "value_mse_loss_layer_023": 0.049072, "value_mse_loss_layer_024": 0.048096, "value_mse_loss_layer_025": 0.066895, "value_mse_loss_layer_026": 0.051025, "value_mse_loss_layer_027": 0.065918, "value_mse_loss_layer_028": 0.068848, "value_mse_loss_layer_029": 0.098145, "value_mse_loss_layer_030": 0.091797, "value_mse_loss_layer_031": 0.110352, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 8.5e-05, "vq_loss_layer_005": 0.000112, "vq_loss_layer_006": 0.000167, "vq_loss_layer_007": 0.000362, "vq_loss_layer_008": 0.000267, "vq_loss_layer_009": 0.000526, "vq_loss_layer_010": 0.000317, "vq_loss_layer_011": 0.000364, "vq_loss_layer_012": 0.000633, "vq_loss_layer_013": 0.000431, "vq_loss_layer_014": 0.000549, "vq_loss_layer_015": 0.000622, "vq_loss_layer_016": 0.000507, "vq_loss_layer_017": 0.000565, "vq_loss_layer_018": 0.000336, "vq_loss_layer_019": 0.000252, "vq_loss_layer_020": 0.00032, "vq_loss_layer_021": 0.000614, "vq_loss_layer_022": 0.00033, "vq_loss_layer_023": 0.000576, "vq_loss_layer_024": 0.00033, "vq_loss_layer_025": 0.000435, "vq_loss_layer_026": 0.000652, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001457, "vq_loss_layer_030": 0.002716, "vq_loss_layer_031": 0.006165 }, { "ce_loss": 2.209431, "epoch": 0.00262, "grad_norm": 0.0040512243285775185, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.123535, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.110352, "key_mse_loss_layer_016": 0.104492, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.111328, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.103516, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.101562, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.099121, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.059387, "kv_vq_loss": 0.000782, "learning_rate": 0.0008545753228299362, "loss": 0.06019, "step": 2620, "value_mse_loss_layer_000": 0.000832, "value_mse_loss_layer_001": 0.002396, "value_mse_loss_layer_002": 0.009399, "value_mse_loss_layer_003": 0.014954, "value_mse_loss_layer_004": 0.013733, "value_mse_loss_layer_005": 0.013916, "value_mse_loss_layer_006": 0.016724, "value_mse_loss_layer_007": 0.018066, "value_mse_loss_layer_008": 0.021484, "value_mse_loss_layer_009": 0.0271, "value_mse_loss_layer_010": 0.022949, "value_mse_loss_layer_011": 0.025635, "value_mse_loss_layer_012": 0.030029, "value_mse_loss_layer_013": 0.028931, "value_mse_loss_layer_014": 0.028442, "value_mse_loss_layer_015": 0.029297, "value_mse_loss_layer_016": 0.0271, "value_mse_loss_layer_017": 0.027954, "value_mse_loss_layer_018": 0.031006, "value_mse_loss_layer_019": 0.032715, "value_mse_loss_layer_020": 0.034424, "value_mse_loss_layer_021": 0.039551, "value_mse_loss_layer_022": 0.040527, "value_mse_loss_layer_023": 0.042969, "value_mse_loss_layer_024": 0.047363, "value_mse_loss_layer_025": 0.057861, "value_mse_loss_layer_026": 0.048584, "value_mse_loss_layer_027": 0.06543, "value_mse_loss_layer_028": 0.064453, "value_mse_loss_layer_029": 0.095703, "value_mse_loss_layer_030": 0.089355, "value_mse_loss_layer_031": 0.115723, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 9.9e-05, "vq_loss_layer_005": 0.000113, "vq_loss_layer_006": 0.000196, "vq_loss_layer_007": 0.000273, "vq_loss_layer_008": 0.000294, "vq_loss_layer_009": 0.00041, "vq_loss_layer_010": 0.00029, "vq_loss_layer_011": 0.000427, "vq_loss_layer_012": 0.00079, "vq_loss_layer_013": 0.000534, "vq_loss_layer_014": 0.000618, "vq_loss_layer_015": 0.000507, "vq_loss_layer_016": 0.000595, "vq_loss_layer_017": 0.000387, "vq_loss_layer_018": 0.000301, "vq_loss_layer_019": 0.000229, "vq_loss_layer_020": 0.000284, "vq_loss_layer_021": 0.000496, "vq_loss_layer_022": 0.000347, "vq_loss_layer_023": 0.000378, "vq_loss_layer_024": 0.00036, "vq_loss_layer_025": 0.000452, "vq_loss_layer_026": 0.000664, "vq_loss_layer_027": 0.000702, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.001411, "vq_loss_layer_030": 0.002823, "vq_loss_layer_031": 0.006836 }, { "ce_loss": 2.268631, "epoch": 0.00263, "grad_norm": 0.007356223650276661, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.05249, "key_mse_loss_layer_003": 0.044434, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.055176, "key_mse_loss_layer_006": 0.062012, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.059137, "kv_vq_loss": 0.000791, "learning_rate": 0.0008549889371224394, "loss": 0.059933, "step": 2630, "value_mse_loss_layer_000": 0.000854, "value_mse_loss_layer_001": 0.00235, "value_mse_loss_layer_002": 0.009521, "value_mse_loss_layer_003": 0.014893, "value_mse_loss_layer_004": 0.014038, "value_mse_loss_layer_005": 0.014221, "value_mse_loss_layer_006": 0.016357, "value_mse_loss_layer_007": 0.018677, "value_mse_loss_layer_008": 0.021606, "value_mse_loss_layer_009": 0.027344, "value_mse_loss_layer_010": 0.023682, "value_mse_loss_layer_011": 0.02478, "value_mse_loss_layer_012": 0.025879, "value_mse_loss_layer_013": 0.026855, "value_mse_loss_layer_014": 0.02832, "value_mse_loss_layer_015": 0.031738, "value_mse_loss_layer_016": 0.0271, "value_mse_loss_layer_017": 0.031738, "value_mse_loss_layer_018": 0.026855, "value_mse_loss_layer_019": 0.032471, "value_mse_loss_layer_020": 0.033691, "value_mse_loss_layer_021": 0.051514, "value_mse_loss_layer_022": 0.038574, "value_mse_loss_layer_023": 0.047363, "value_mse_loss_layer_024": 0.047119, "value_mse_loss_layer_025": 0.057861, "value_mse_loss_layer_026": 0.051514, "value_mse_loss_layer_027": 0.072754, "value_mse_loss_layer_028": 0.071289, "value_mse_loss_layer_029": 0.124023, "value_mse_loss_layer_030": 0.118164, "value_mse_loss_layer_031": 0.116211, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 0.000112, "vq_loss_layer_006": 0.000177, "vq_loss_layer_007": 0.000273, "vq_loss_layer_008": 0.000298, "vq_loss_layer_009": 0.000343, "vq_loss_layer_010": 0.000322, "vq_loss_layer_011": 0.000349, "vq_loss_layer_012": 0.000572, "vq_loss_layer_013": 0.000454, "vq_loss_layer_014": 0.000622, "vq_loss_layer_015": 0.00079, "vq_loss_layer_016": 0.00074, "vq_loss_layer_017": 0.000626, "vq_loss_layer_018": 0.000257, "vq_loss_layer_019": 0.000288, "vq_loss_layer_020": 0.000334, "vq_loss_layer_021": 0.000908, "vq_loss_layer_022": 0.000402, "vq_loss_layer_023": 0.000534, "vq_loss_layer_024": 0.000406, "vq_loss_layer_025": 0.000504, "vq_loss_layer_026": 0.000736, "vq_loss_layer_027": 0.000977, "vq_loss_layer_028": 0.001472, "vq_loss_layer_029": 0.002258, "vq_loss_layer_030": 0.003387, "vq_loss_layer_031": 0.008606 }, { "ce_loss": 2.320411, "epoch": 0.00264, "grad_norm": 0.004715586081147194, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.058801, "kv_vq_loss": 0.000749, "learning_rate": 0.0008554009817174576, "loss": 0.059576, "step": 2640, "value_mse_loss_layer_000": 0.00087, "value_mse_loss_layer_001": 0.002365, "value_mse_loss_layer_002": 0.009644, "value_mse_loss_layer_003": 0.015076, "value_mse_loss_layer_004": 0.014404, "value_mse_loss_layer_005": 0.013672, "value_mse_loss_layer_006": 0.016113, "value_mse_loss_layer_007": 0.018188, "value_mse_loss_layer_008": 0.021484, "value_mse_loss_layer_009": 0.029297, "value_mse_loss_layer_010": 0.023804, "value_mse_loss_layer_011": 0.025024, "value_mse_loss_layer_012": 0.026001, "value_mse_loss_layer_013": 0.026733, "value_mse_loss_layer_014": 0.027832, "value_mse_loss_layer_015": 0.031006, "value_mse_loss_layer_016": 0.026855, "value_mse_loss_layer_017": 0.030396, "value_mse_loss_layer_018": 0.030884, "value_mse_loss_layer_019": 0.032959, "value_mse_loss_layer_020": 0.042969, "value_mse_loss_layer_021": 0.041992, "value_mse_loss_layer_022": 0.040039, "value_mse_loss_layer_023": 0.055176, "value_mse_loss_layer_024": 0.046631, "value_mse_loss_layer_025": 0.063965, "value_mse_loss_layer_026": 0.049072, "value_mse_loss_layer_027": 0.067383, "value_mse_loss_layer_028": 0.065918, "value_mse_loss_layer_029": 0.098145, "value_mse_loss_layer_030": 0.09375, "value_mse_loss_layer_031": 0.108398, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 9.8e-05, "vq_loss_layer_005": 0.000101, "vq_loss_layer_006": 0.000172, "vq_loss_layer_007": 0.000246, "vq_loss_layer_008": 0.000263, "vq_loss_layer_009": 0.00042, "vq_loss_layer_010": 0.000269, "vq_loss_layer_011": 0.000324, "vq_loss_layer_012": 0.000519, "vq_loss_layer_013": 0.000397, "vq_loss_layer_014": 0.000496, "vq_loss_layer_015": 0.000557, "vq_loss_layer_016": 0.000534, "vq_loss_layer_017": 0.000458, "vq_loss_layer_018": 0.000328, "vq_loss_layer_019": 0.000225, "vq_loss_layer_020": 0.000366, "vq_loss_layer_021": 0.000572, "vq_loss_layer_022": 0.000322, "vq_loss_layer_023": 0.000645, "vq_loss_layer_024": 0.000341, "vq_loss_layer_025": 0.000479, "vq_loss_layer_026": 0.000629, "vq_loss_layer_027": 0.000721, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001495, "vq_loss_layer_030": 0.003189, "vq_loss_layer_031": 0.006927 }, { "ce_loss": 2.273006, "epoch": 0.00265, "grad_norm": 0.005073913373053074, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.052246, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.058453, "kv_vq_loss": 0.000766, "learning_rate": 0.0008558114684842019, "loss": 0.059225, "step": 2650, "value_mse_loss_layer_000": 0.000835, "value_mse_loss_layer_001": 0.00238, "value_mse_loss_layer_002": 0.009888, "value_mse_loss_layer_003": 0.015991, "value_mse_loss_layer_004": 0.013733, "value_mse_loss_layer_005": 0.014587, "value_mse_loss_layer_006": 0.016357, "value_mse_loss_layer_007": 0.018188, "value_mse_loss_layer_008": 0.021973, "value_mse_loss_layer_009": 0.0271, "value_mse_loss_layer_010": 0.02356, "value_mse_loss_layer_011": 0.025879, "value_mse_loss_layer_012": 0.026123, "value_mse_loss_layer_013": 0.027222, "value_mse_loss_layer_014": 0.030762, "value_mse_loss_layer_015": 0.031006, "value_mse_loss_layer_016": 0.0271, "value_mse_loss_layer_017": 0.030518, "value_mse_loss_layer_018": 0.028809, "value_mse_loss_layer_019": 0.032959, "value_mse_loss_layer_020": 0.039062, "value_mse_loss_layer_021": 0.044189, "value_mse_loss_layer_022": 0.040771, "value_mse_loss_layer_023": 0.048828, "value_mse_loss_layer_024": 0.05127, "value_mse_loss_layer_025": 0.069336, "value_mse_loss_layer_026": 0.057129, "value_mse_loss_layer_027": 0.071289, "value_mse_loss_layer_028": 0.072266, "value_mse_loss_layer_029": 0.109375, "value_mse_loss_layer_030": 0.09375, "value_mse_loss_layer_031": 0.11377, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 0.000123, "vq_loss_layer_006": 0.000179, "vq_loss_layer_007": 0.000256, "vq_loss_layer_008": 0.000299, "vq_loss_layer_009": 0.000313, "vq_loss_layer_010": 0.00029, "vq_loss_layer_011": 0.000362, "vq_loss_layer_012": 0.000538, "vq_loss_layer_013": 0.000429, "vq_loss_layer_014": 0.000595, "vq_loss_layer_015": 0.000622, "vq_loss_layer_016": 0.000568, "vq_loss_layer_017": 0.000463, "vq_loss_layer_018": 0.000265, "vq_loss_layer_019": 0.00024, "vq_loss_layer_020": 0.00034, "vq_loss_layer_021": 0.000557, "vq_loss_layer_022": 0.000353, "vq_loss_layer_023": 0.000538, "vq_loss_layer_024": 0.000357, "vq_loss_layer_025": 0.000484, "vq_loss_layer_026": 0.000778, "vq_loss_layer_027": 0.000725, "vq_loss_layer_028": 0.00106, "vq_loss_layer_029": 0.001999, "vq_loss_layer_030": 0.003036, "vq_loss_layer_031": 0.007202 }, { "ce_loss": 2.31542, "epoch": 0.00266, "grad_norm": 0.00526758749037981, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.058728, "kv_vq_loss": 0.000749, "learning_rate": 0.0008562204091577667, "loss": 0.059491, "step": 2660, "value_mse_loss_layer_000": 0.000851, "value_mse_loss_layer_001": 0.002396, "value_mse_loss_layer_002": 0.00946, "value_mse_loss_layer_003": 0.015869, "value_mse_loss_layer_004": 0.014954, "value_mse_loss_layer_005": 0.013672, "value_mse_loss_layer_006": 0.016357, "value_mse_loss_layer_007": 0.018188, "value_mse_loss_layer_008": 0.021973, "value_mse_loss_layer_009": 0.027954, "value_mse_loss_layer_010": 0.023071, "value_mse_loss_layer_011": 0.024902, "value_mse_loss_layer_012": 0.028076, "value_mse_loss_layer_013": 0.026855, "value_mse_loss_layer_014": 0.028198, "value_mse_loss_layer_015": 0.03125, "value_mse_loss_layer_016": 0.027954, "value_mse_loss_layer_017": 0.031738, "value_mse_loss_layer_018": 0.03125, "value_mse_loss_layer_019": 0.034424, "value_mse_loss_layer_020": 0.035889, "value_mse_loss_layer_021": 0.04126, "value_mse_loss_layer_022": 0.040527, "value_mse_loss_layer_023": 0.046143, "value_mse_loss_layer_024": 0.050049, "value_mse_loss_layer_025": 0.068848, "value_mse_loss_layer_026": 0.053223, "value_mse_loss_layer_027": 0.067383, "value_mse_loss_layer_028": 0.081055, "value_mse_loss_layer_029": 0.106445, "value_mse_loss_layer_030": 0.09082, "value_mse_loss_layer_031": 0.110352, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 0.00013, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000188, "vq_loss_layer_007": 0.000263, "vq_loss_layer_008": 0.000282, "vq_loss_layer_009": 0.000376, "vq_loss_layer_010": 0.000256, "vq_loss_layer_011": 0.000305, "vq_loss_layer_012": 0.000687, "vq_loss_layer_013": 0.000429, "vq_loss_layer_014": 0.000526, "vq_loss_layer_015": 0.000553, "vq_loss_layer_016": 0.000565, "vq_loss_layer_017": 0.000553, "vq_loss_layer_018": 0.000317, "vq_loss_layer_019": 0.000219, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000467, "vq_loss_layer_022": 0.000303, "vq_loss_layer_023": 0.000341, "vq_loss_layer_024": 0.00033, "vq_loss_layer_025": 0.000433, "vq_loss_layer_026": 0.000671, "vq_loss_layer_027": 0.000645, "vq_loss_layer_028": 0.001236, "vq_loss_layer_029": 0.00145, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.006561 }, { "ce_loss": 2.312426, "epoch": 0.00267, "grad_norm": 0.005194749217480421, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.05896, "kv_vq_loss": 0.000744, "learning_rate": 0.0008566278153411438, "loss": 0.059732, "step": 2670, "value_mse_loss_layer_000": 0.00082, "value_mse_loss_layer_001": 0.002304, "value_mse_loss_layer_002": 0.00946, "value_mse_loss_layer_003": 0.015442, "value_mse_loss_layer_004": 0.014221, "value_mse_loss_layer_005": 0.014526, "value_mse_loss_layer_006": 0.016113, "value_mse_loss_layer_007": 0.018311, "value_mse_loss_layer_008": 0.022461, "value_mse_loss_layer_009": 0.027344, "value_mse_loss_layer_010": 0.023438, "value_mse_loss_layer_011": 0.025269, "value_mse_loss_layer_012": 0.027222, "value_mse_loss_layer_013": 0.026733, "value_mse_loss_layer_014": 0.029175, "value_mse_loss_layer_015": 0.03064, "value_mse_loss_layer_016": 0.026733, "value_mse_loss_layer_017": 0.029175, "value_mse_loss_layer_018": 0.029785, "value_mse_loss_layer_019": 0.0354, "value_mse_loss_layer_020": 0.034424, "value_mse_loss_layer_021": 0.042969, "value_mse_loss_layer_022": 0.042236, "value_mse_loss_layer_023": 0.054443, "value_mse_loss_layer_024": 0.051025, "value_mse_loss_layer_025": 0.063965, "value_mse_loss_layer_026": 0.057129, "value_mse_loss_layer_027": 0.069824, "value_mse_loss_layer_028": 0.071777, "value_mse_loss_layer_029": 0.100586, "value_mse_loss_layer_030": 0.098633, "value_mse_loss_layer_031": 0.123047, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 0.000123, "vq_loss_layer_006": 0.00016, "vq_loss_layer_007": 0.000256, "vq_loss_layer_008": 0.000315, "vq_loss_layer_009": 0.000362, "vq_loss_layer_010": 0.000315, "vq_loss_layer_011": 0.000345, "vq_loss_layer_012": 0.000561, "vq_loss_layer_013": 0.000422, "vq_loss_layer_014": 0.000572, "vq_loss_layer_015": 0.000607, "vq_loss_layer_016": 0.000557, "vq_loss_layer_017": 0.000406, "vq_loss_layer_018": 0.000307, "vq_loss_layer_019": 0.000261, "vq_loss_layer_020": 0.000246, "vq_loss_layer_021": 0.000526, "vq_loss_layer_022": 0.000359, "vq_loss_layer_023": 0.000523, "vq_loss_layer_024": 0.000381, "vq_loss_layer_025": 0.000479, "vq_loss_layer_026": 0.000824, "vq_loss_layer_027": 0.000755, "vq_loss_layer_028": 0.001221, "vq_loss_layer_029": 0.001801, "vq_loss_layer_030": 0.003494, "vq_loss_layer_031": 0.009033 }, { "ce_loss": 2.317455, "epoch": 0.00268, "grad_norm": 0.005571083165705204, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.058923, "kv_vq_loss": 0.000746, "learning_rate": 0.0008570336985071972, "loss": 0.059695, "step": 2680, "value_mse_loss_layer_000": 0.000839, "value_mse_loss_layer_001": 0.002457, "value_mse_loss_layer_002": 0.00946, "value_mse_loss_layer_003": 0.015137, "value_mse_loss_layer_004": 0.013428, "value_mse_loss_layer_005": 0.014099, "value_mse_loss_layer_006": 0.016479, "value_mse_loss_layer_007": 0.018799, "value_mse_loss_layer_008": 0.021851, "value_mse_loss_layer_009": 0.028198, "value_mse_loss_layer_010": 0.024658, "value_mse_loss_layer_011": 0.027344, "value_mse_loss_layer_012": 0.027588, "value_mse_loss_layer_013": 0.028442, "value_mse_loss_layer_014": 0.034668, "value_mse_loss_layer_015": 0.033203, "value_mse_loss_layer_016": 0.027466, "value_mse_loss_layer_017": 0.030151, "value_mse_loss_layer_018": 0.028809, "value_mse_loss_layer_019": 0.032959, "value_mse_loss_layer_020": 0.039307, "value_mse_loss_layer_021": 0.040771, "value_mse_loss_layer_022": 0.040771, "value_mse_loss_layer_023": 0.04834, "value_mse_loss_layer_024": 0.048096, "value_mse_loss_layer_025": 0.06543, "value_mse_loss_layer_026": 0.050781, "value_mse_loss_layer_027": 0.073242, "value_mse_loss_layer_028": 0.069824, "value_mse_loss_layer_029": 0.121094, "value_mse_loss_layer_030": 0.095703, "value_mse_loss_layer_031": 0.109375, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 0.000122, "vq_loss_layer_006": 0.000183, "vq_loss_layer_007": 0.000319, "vq_loss_layer_008": 0.000305, "vq_loss_layer_009": 0.000372, "vq_loss_layer_010": 0.000338, "vq_loss_layer_011": 0.000463, "vq_loss_layer_012": 0.000576, "vq_loss_layer_013": 0.000534, "vq_loss_layer_014": 0.000946, "vq_loss_layer_015": 0.000645, "vq_loss_layer_016": 0.000599, "vq_loss_layer_017": 0.0005, "vq_loss_layer_018": 0.000275, "vq_loss_layer_019": 0.000271, "vq_loss_layer_020": 0.000387, "vq_loss_layer_021": 0.000507, "vq_loss_layer_022": 0.000347, "vq_loss_layer_023": 0.000412, "vq_loss_layer_024": 0.000341, "vq_loss_layer_025": 0.000435, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.000835, "vq_loss_layer_028": 0.000973, "vq_loss_layer_029": 0.001984, "vq_loss_layer_030": 0.002548, "vq_loss_layer_031": 0.006287 }, { "ce_loss": 2.28976, "epoch": 0.00269, "grad_norm": 0.005216309335082769, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.059128, "kv_vq_loss": 0.000745, "learning_rate": 0.000857438070000602, "loss": 0.059891, "step": 2690, "value_mse_loss_layer_000": 0.000832, "value_mse_loss_layer_001": 0.00238, "value_mse_loss_layer_002": 0.010498, "value_mse_loss_layer_003": 0.015747, "value_mse_loss_layer_004": 0.013306, "value_mse_loss_layer_005": 0.013916, "value_mse_loss_layer_006": 0.016113, "value_mse_loss_layer_007": 0.018555, "value_mse_loss_layer_008": 0.022339, "value_mse_loss_layer_009": 0.028687, "value_mse_loss_layer_010": 0.024048, "value_mse_loss_layer_011": 0.025757, "value_mse_loss_layer_012": 0.026733, "value_mse_loss_layer_013": 0.028809, "value_mse_loss_layer_014": 0.028809, "value_mse_loss_layer_015": 0.030151, "value_mse_loss_layer_016": 0.026855, "value_mse_loss_layer_017": 0.030273, "value_mse_loss_layer_018": 0.028809, "value_mse_loss_layer_019": 0.033203, "value_mse_loss_layer_020": 0.036865, "value_mse_loss_layer_021": 0.041504, "value_mse_loss_layer_022": 0.041748, "value_mse_loss_layer_023": 0.046143, "value_mse_loss_layer_024": 0.054443, "value_mse_loss_layer_025": 0.067383, "value_mse_loss_layer_026": 0.051758, "value_mse_loss_layer_027": 0.065918, "value_mse_loss_layer_028": 0.065918, "value_mse_loss_layer_029": 0.098633, "value_mse_loss_layer_030": 0.09375, "value_mse_loss_layer_031": 0.111328, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 0.000115, "vq_loss_layer_006": 0.000175, "vq_loss_layer_007": 0.000282, "vq_loss_layer_008": 0.000311, "vq_loss_layer_009": 0.000389, "vq_loss_layer_010": 0.000305, "vq_loss_layer_011": 0.000345, "vq_loss_layer_012": 0.000561, "vq_loss_layer_013": 0.000519, "vq_loss_layer_014": 0.000557, "vq_loss_layer_015": 0.000515, "vq_loss_layer_016": 0.000572, "vq_loss_layer_017": 0.000526, "vq_loss_layer_018": 0.000319, "vq_loss_layer_019": 0.000254, "vq_loss_layer_020": 0.000362, "vq_loss_layer_021": 0.000511, "vq_loss_layer_022": 0.000397, "vq_loss_layer_023": 0.000439, "vq_loss_layer_024": 0.000454, "vq_loss_layer_025": 0.000507, "vq_loss_layer_026": 0.000729, "vq_loss_layer_027": 0.000732, "vq_loss_layer_028": 0.001083, "vq_loss_layer_029": 0.001694, "vq_loss_layer_030": 0.002441, "vq_loss_layer_031": 0.006622 }, { "ce_loss": 2.31871, "epoch": 0.0027, "grad_norm": 0.004559903405606747, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.089355, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.080566, "kv_mse_loss": 0.058459, "kv_vq_loss": 0.000738, "learning_rate": 0.0008578409410397467, "loss": 0.059213, "step": 2700, "value_mse_loss_layer_000": 0.000843, "value_mse_loss_layer_001": 0.00238, "value_mse_loss_layer_002": 0.009888, "value_mse_loss_layer_003": 0.014832, "value_mse_loss_layer_004": 0.014282, "value_mse_loss_layer_005": 0.013672, "value_mse_loss_layer_006": 0.015991, "value_mse_loss_layer_007": 0.018799, "value_mse_loss_layer_008": 0.021484, "value_mse_loss_layer_009": 0.026855, "value_mse_loss_layer_010": 0.023071, "value_mse_loss_layer_011": 0.024658, "value_mse_loss_layer_012": 0.024902, "value_mse_loss_layer_013": 0.026123, "value_mse_loss_layer_014": 0.029175, "value_mse_loss_layer_015": 0.030029, "value_mse_loss_layer_016": 0.02771, "value_mse_loss_layer_017": 0.032227, "value_mse_loss_layer_018": 0.03125, "value_mse_loss_layer_019": 0.034912, "value_mse_loss_layer_020": 0.036621, "value_mse_loss_layer_021": 0.042725, "value_mse_loss_layer_022": 0.042236, "value_mse_loss_layer_023": 0.049316, "value_mse_loss_layer_024": 0.054443, "value_mse_loss_layer_025": 0.069824, "value_mse_loss_layer_026": 0.055908, "value_mse_loss_layer_027": 0.072266, "value_mse_loss_layer_028": 0.07373, "value_mse_loss_layer_029": 0.115723, "value_mse_loss_layer_030": 0.098145, "value_mse_loss_layer_031": 0.116211, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 0.000102, "vq_loss_layer_005": 0.0001, "vq_loss_layer_006": 0.000159, "vq_loss_layer_007": 0.00028, "vq_loss_layer_008": 0.00025, "vq_loss_layer_009": 0.000311, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.000301, "vq_loss_layer_012": 0.000484, "vq_loss_layer_013": 0.000383, "vq_loss_layer_014": 0.000538, "vq_loss_layer_015": 0.000515, "vq_loss_layer_016": 0.000584, "vq_loss_layer_017": 0.000542, "vq_loss_layer_018": 0.000303, "vq_loss_layer_019": 0.000294, "vq_loss_layer_020": 0.000277, "vq_loss_layer_021": 0.000467, "vq_loss_layer_022": 0.000334, "vq_loss_layer_023": 0.000399, "vq_loss_layer_024": 0.000441, "vq_loss_layer_025": 0.000603, "vq_loss_layer_026": 0.000854, "vq_loss_layer_027": 0.000973, "vq_loss_layer_028": 0.001793, "vq_loss_layer_029": 0.004211, "vq_loss_layer_030": 0.004364, "vq_loss_layer_031": 0.009338 }, { "ce_loss": 2.302866, "epoch": 0.00271, "grad_norm": 0.0042917924001812935, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.103516, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.100098, "key_mse_loss_layer_023": 0.097168, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.058182, "kv_vq_loss": 0.000733, "learning_rate": 0.0008582423227186014, "loss": 0.058932, "step": 2710, "value_mse_loss_layer_000": 0.000858, "value_mse_loss_layer_001": 0.002396, "value_mse_loss_layer_002": 0.009338, "value_mse_loss_layer_003": 0.014832, "value_mse_loss_layer_004": 0.013733, "value_mse_loss_layer_005": 0.01355, "value_mse_loss_layer_006": 0.018799, "value_mse_loss_layer_007": 0.018555, "value_mse_loss_layer_008": 0.021118, "value_mse_loss_layer_009": 0.028931, "value_mse_loss_layer_010": 0.023682, "value_mse_loss_layer_011": 0.024902, "value_mse_loss_layer_012": 0.033203, "value_mse_loss_layer_013": 0.027588, "value_mse_loss_layer_014": 0.028198, "value_mse_loss_layer_015": 0.029419, "value_mse_loss_layer_016": 0.025757, "value_mse_loss_layer_017": 0.030029, "value_mse_loss_layer_018": 0.028442, "value_mse_loss_layer_019": 0.032959, "value_mse_loss_layer_020": 0.0354, "value_mse_loss_layer_021": 0.042236, "value_mse_loss_layer_022": 0.041016, "value_mse_loss_layer_023": 0.043701, "value_mse_loss_layer_024": 0.05127, "value_mse_loss_layer_025": 0.05957, "value_mse_loss_layer_026": 0.051514, "value_mse_loss_layer_027": 0.069824, "value_mse_loss_layer_028": 0.066406, "value_mse_loss_layer_029": 0.097168, "value_mse_loss_layer_030": 0.090332, "value_mse_loss_layer_031": 0.112305, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 9.3e-05, "vq_loss_layer_005": 0.000102, "vq_loss_layer_006": 0.000307, "vq_loss_layer_007": 0.000278, "vq_loss_layer_008": 0.00025, "vq_loss_layer_009": 0.000444, "vq_loss_layer_010": 0.000284, "vq_loss_layer_011": 0.000319, "vq_loss_layer_012": 0.001068, "vq_loss_layer_013": 0.000429, "vq_loss_layer_014": 0.000572, "vq_loss_layer_015": 0.000479, "vq_loss_layer_016": 0.000511, "vq_loss_layer_017": 0.000429, "vq_loss_layer_018": 0.000267, "vq_loss_layer_019": 0.000212, "vq_loss_layer_020": 0.000269, "vq_loss_layer_021": 0.000546, "vq_loss_layer_022": 0.000338, "vq_loss_layer_023": 0.00036, "vq_loss_layer_024": 0.00037, "vq_loss_layer_025": 0.000431, "vq_loss_layer_026": 0.000771, "vq_loss_layer_027": 0.000813, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.001572, "vq_loss_layer_030": 0.00351, "vq_loss_layer_031": 0.006958 }, { "ce_loss": 2.260992, "epoch": 0.00272, "grad_norm": 0.00587945943698287, "key_mse_loss_layer_000": 0.003769, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.059421, "kv_vq_loss": 0.000759, "learning_rate": 0.0008586422260085496, "loss": 0.060196, "step": 2720, "value_mse_loss_layer_000": 0.000847, "value_mse_loss_layer_001": 0.002396, "value_mse_loss_layer_002": 0.009949, "value_mse_loss_layer_003": 0.016846, "value_mse_loss_layer_004": 0.016357, "value_mse_loss_layer_005": 0.014587, "value_mse_loss_layer_006": 0.016968, "value_mse_loss_layer_007": 0.018311, "value_mse_loss_layer_008": 0.022583, "value_mse_loss_layer_009": 0.029419, "value_mse_loss_layer_010": 0.028809, "value_mse_loss_layer_011": 0.027222, "value_mse_loss_layer_012": 0.03064, "value_mse_loss_layer_013": 0.029053, "value_mse_loss_layer_014": 0.031738, "value_mse_loss_layer_015": 0.03418, "value_mse_loss_layer_016": 0.027954, "value_mse_loss_layer_017": 0.03125, "value_mse_loss_layer_018": 0.032959, "value_mse_loss_layer_019": 0.035156, "value_mse_loss_layer_020": 0.034912, "value_mse_loss_layer_021": 0.044922, "value_mse_loss_layer_022": 0.041992, "value_mse_loss_layer_023": 0.045166, "value_mse_loss_layer_024": 0.054443, "value_mse_loss_layer_025": 0.069824, "value_mse_loss_layer_026": 0.052246, "value_mse_loss_layer_027": 0.069336, "value_mse_loss_layer_028": 0.072754, "value_mse_loss_layer_029": 0.117676, "value_mse_loss_layer_030": 0.096191, "value_mse_loss_layer_031": 0.110352, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 0.000191, "vq_loss_layer_005": 0.000122, "vq_loss_layer_006": 0.000198, "vq_loss_layer_007": 0.000265, "vq_loss_layer_008": 0.000311, "vq_loss_layer_009": 0.000414, "vq_loss_layer_010": 0.000402, "vq_loss_layer_011": 0.000412, "vq_loss_layer_012": 0.000778, "vq_loss_layer_013": 0.000504, "vq_loss_layer_014": 0.000618, "vq_loss_layer_015": 0.000706, "vq_loss_layer_016": 0.000561, "vq_loss_layer_017": 0.000526, "vq_loss_layer_018": 0.000404, "vq_loss_layer_019": 0.000288, "vq_loss_layer_020": 0.000282, "vq_loss_layer_021": 0.000572, "vq_loss_layer_022": 0.000378, "vq_loss_layer_023": 0.000353, "vq_loss_layer_024": 0.000465, "vq_loss_layer_025": 0.000492, "vq_loss_layer_026": 0.000645, "vq_loss_layer_027": 0.000759, "vq_loss_layer_028": 0.00106, "vq_loss_layer_029": 0.00209, "vq_loss_layer_030": 0.002899, "vq_loss_layer_031": 0.006958 }, { "ce_loss": 2.221434, "epoch": 0.00273, "grad_norm": 0.005172531120479107, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.058575, "kv_vq_loss": 0.00074, "learning_rate": 0.0008590406617601889, "loss": 0.059332, "step": 2730, "value_mse_loss_layer_000": 0.000847, "value_mse_loss_layer_001": 0.00238, "value_mse_loss_layer_002": 0.009277, "value_mse_loss_layer_003": 0.015137, "value_mse_loss_layer_004": 0.013611, "value_mse_loss_layer_005": 0.013245, "value_mse_loss_layer_006": 0.015869, "value_mse_loss_layer_007": 0.019653, "value_mse_loss_layer_008": 0.02124, "value_mse_loss_layer_009": 0.027588, "value_mse_loss_layer_010": 0.023926, "value_mse_loss_layer_011": 0.024902, "value_mse_loss_layer_012": 0.026245, "value_mse_loss_layer_013": 0.027222, "value_mse_loss_layer_014": 0.02832, "value_mse_loss_layer_015": 0.03125, "value_mse_loss_layer_016": 0.026001, "value_mse_loss_layer_017": 0.031494, "value_mse_loss_layer_018": 0.028687, "value_mse_loss_layer_019": 0.033691, "value_mse_loss_layer_020": 0.0354, "value_mse_loss_layer_021": 0.041016, "value_mse_loss_layer_022": 0.038574, "value_mse_loss_layer_023": 0.047119, "value_mse_loss_layer_024": 0.047363, "value_mse_loss_layer_025": 0.058838, "value_mse_loss_layer_026": 0.04541, "value_mse_loss_layer_027": 0.083008, "value_mse_loss_layer_028": 0.064453, "value_mse_loss_layer_029": 0.094727, "value_mse_loss_layer_030": 0.083984, "value_mse_loss_layer_031": 0.105469, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 0.000104, "vq_loss_layer_005": 9.8e-05, "vq_loss_layer_006": 0.00016, "vq_loss_layer_007": 0.000347, "vq_loss_layer_008": 0.000239, "vq_loss_layer_009": 0.000336, "vq_loss_layer_010": 0.000259, "vq_loss_layer_011": 0.000301, "vq_loss_layer_012": 0.000572, "vq_loss_layer_013": 0.000406, "vq_loss_layer_014": 0.000538, "vq_loss_layer_015": 0.000587, "vq_loss_layer_016": 0.000475, "vq_loss_layer_017": 0.000511, "vq_loss_layer_018": 0.000271, "vq_loss_layer_019": 0.000236, "vq_loss_layer_020": 0.000284, "vq_loss_layer_021": 0.000526, "vq_loss_layer_022": 0.000313, "vq_loss_layer_023": 0.000479, "vq_loss_layer_024": 0.000347, "vq_loss_layer_025": 0.000402, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.00132, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.001335, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.006042 }, { "ce_loss": 2.315524, "epoch": 0.00274, "grad_norm": 0.005984514020383358, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.058734, "kv_vq_loss": 0.000741, "learning_rate": 0.0008594376407050967, "loss": 0.059479, "step": 2740, "value_mse_loss_layer_000": 0.000851, "value_mse_loss_layer_001": 0.00238, "value_mse_loss_layer_002": 0.009705, "value_mse_loss_layer_003": 0.015259, "value_mse_loss_layer_004": 0.013794, "value_mse_loss_layer_005": 0.013367, "value_mse_loss_layer_006": 0.016602, "value_mse_loss_layer_007": 0.017944, "value_mse_loss_layer_008": 0.021484, "value_mse_loss_layer_009": 0.028198, "value_mse_loss_layer_010": 0.023315, "value_mse_loss_layer_011": 0.026855, "value_mse_loss_layer_012": 0.026123, "value_mse_loss_layer_013": 0.026733, "value_mse_loss_layer_014": 0.028442, "value_mse_loss_layer_015": 0.03064, "value_mse_loss_layer_016": 0.03125, "value_mse_loss_layer_017": 0.031128, "value_mse_loss_layer_018": 0.029419, "value_mse_loss_layer_019": 0.04541, "value_mse_loss_layer_020": 0.034424, "value_mse_loss_layer_021": 0.043945, "value_mse_loss_layer_022": 0.041504, "value_mse_loss_layer_023": 0.048096, "value_mse_loss_layer_024": 0.053711, "value_mse_loss_layer_025": 0.072266, "value_mse_loss_layer_026": 0.053711, "value_mse_loss_layer_027": 0.074219, "value_mse_loss_layer_028": 0.068848, "value_mse_loss_layer_029": 0.103516, "value_mse_loss_layer_030": 0.097168, "value_mse_loss_layer_031": 0.114258, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 9.2e-05, "vq_loss_layer_005": 9.9e-05, "vq_loss_layer_006": 0.000207, "vq_loss_layer_007": 0.000267, "vq_loss_layer_008": 0.000267, "vq_loss_layer_009": 0.000383, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000469, "vq_loss_layer_012": 0.000542, "vq_loss_layer_013": 0.000418, "vq_loss_layer_014": 0.000515, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.000713, "vq_loss_layer_017": 0.000492, "vq_loss_layer_018": 0.000301, "vq_loss_layer_019": 0.000324, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000565, "vq_loss_layer_022": 0.000334, "vq_loss_layer_023": 0.000423, "vq_loss_layer_024": 0.000406, "vq_loss_layer_025": 0.000484, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.000809, "vq_loss_layer_028": 0.000889, "vq_loss_layer_029": 0.001572, "vq_loss_layer_030": 0.002502, "vq_loss_layer_031": 0.007019 }, { "ce_loss": 2.291751, "epoch": 0.00275, "grad_norm": 0.004733087029308081, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.059305, "kv_vq_loss": 0.00077, "learning_rate": 0.0008598331734575655, "loss": 0.060086, "step": 2750, "value_mse_loss_layer_000": 0.000843, "value_mse_loss_layer_001": 0.00235, "value_mse_loss_layer_002": 0.009338, "value_mse_loss_layer_003": 0.014771, "value_mse_loss_layer_004": 0.013245, "value_mse_loss_layer_005": 0.013794, "value_mse_loss_layer_006": 0.016602, "value_mse_loss_layer_007": 0.018188, "value_mse_loss_layer_008": 0.022095, "value_mse_loss_layer_009": 0.028442, "value_mse_loss_layer_010": 0.025269, "value_mse_loss_layer_011": 0.026245, "value_mse_loss_layer_012": 0.026978, "value_mse_loss_layer_013": 0.028564, "value_mse_loss_layer_014": 0.033936, "value_mse_loss_layer_015": 0.033203, "value_mse_loss_layer_016": 0.028564, "value_mse_loss_layer_017": 0.032227, "value_mse_loss_layer_018": 0.028931, "value_mse_loss_layer_019": 0.037354, "value_mse_loss_layer_020": 0.04248, "value_mse_loss_layer_021": 0.043701, "value_mse_loss_layer_022": 0.041016, "value_mse_loss_layer_023": 0.053467, "value_mse_loss_layer_024": 0.048096, "value_mse_loss_layer_025": 0.078613, "value_mse_loss_layer_026": 0.054688, "value_mse_loss_layer_027": 0.069336, "value_mse_loss_layer_028": 0.070801, "value_mse_loss_layer_029": 0.099121, "value_mse_loss_layer_030": 0.089355, "value_mse_loss_layer_031": 0.107422, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000205, "vq_loss_layer_007": 0.000261, "vq_loss_layer_008": 0.000286, "vq_loss_layer_009": 0.000355, "vq_loss_layer_010": 0.000307, "vq_loss_layer_011": 0.00032, "vq_loss_layer_012": 0.000553, "vq_loss_layer_013": 0.000448, "vq_loss_layer_014": 0.000675, "vq_loss_layer_015": 0.000595, "vq_loss_layer_016": 0.000553, "vq_loss_layer_017": 0.000526, "vq_loss_layer_018": 0.000319, "vq_loss_layer_019": 0.000275, "vq_loss_layer_020": 0.000343, "vq_loss_layer_021": 0.000572, "vq_loss_layer_022": 0.000353, "vq_loss_layer_023": 0.000542, "vq_loss_layer_024": 0.000347, "vq_loss_layer_025": 0.000565, "vq_loss_layer_026": 0.000729, "vq_loss_layer_027": 0.00069, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.001625, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.006409 }, { "ce_loss": 2.343844, "epoch": 0.00276, "grad_norm": 0.008054560050368309, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.058209, "kv_vq_loss": 0.000696, "learning_rate": 0.0008602272705163043, "loss": 0.058951, "step": 2760, "value_mse_loss_layer_000": 0.000839, "value_mse_loss_layer_001": 0.002335, "value_mse_loss_layer_002": 0.009338, "value_mse_loss_layer_003": 0.014526, "value_mse_loss_layer_004": 0.014526, "value_mse_loss_layer_005": 0.014099, "value_mse_loss_layer_006": 0.016602, "value_mse_loss_layer_007": 0.018066, "value_mse_loss_layer_008": 0.021484, "value_mse_loss_layer_009": 0.027222, "value_mse_loss_layer_010": 0.023438, "value_mse_loss_layer_011": 0.025269, "value_mse_loss_layer_012": 0.026978, "value_mse_loss_layer_013": 0.027222, "value_mse_loss_layer_014": 0.028931, "value_mse_loss_layer_015": 0.03064, "value_mse_loss_layer_016": 0.026733, "value_mse_loss_layer_017": 0.030029, "value_mse_loss_layer_018": 0.030273, "value_mse_loss_layer_019": 0.03418, "value_mse_loss_layer_020": 0.036133, "value_mse_loss_layer_021": 0.045654, "value_mse_loss_layer_022": 0.043457, "value_mse_loss_layer_023": 0.047363, "value_mse_loss_layer_024": 0.053955, "value_mse_loss_layer_025": 0.066406, "value_mse_loss_layer_026": 0.057373, "value_mse_loss_layer_027": 0.069336, "value_mse_loss_layer_028": 0.072266, "value_mse_loss_layer_029": 0.15918, "value_mse_loss_layer_030": 0.098633, "value_mse_loss_layer_031": 0.111328, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 0.000106, "vq_loss_layer_005": 0.000108, "vq_loss_layer_006": 0.000179, "vq_loss_layer_007": 0.000246, "vq_loss_layer_008": 0.000261, "vq_loss_layer_009": 0.000305, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000317, "vq_loss_layer_012": 0.000572, "vq_loss_layer_013": 0.000412, "vq_loss_layer_014": 0.000546, "vq_loss_layer_015": 0.000553, "vq_loss_layer_016": 0.000534, "vq_loss_layer_017": 0.000448, "vq_loss_layer_018": 0.000322, "vq_loss_layer_019": 0.000267, "vq_loss_layer_020": 0.000296, "vq_loss_layer_021": 0.000576, "vq_loss_layer_022": 0.00041, "vq_loss_layer_023": 0.00045, "vq_loss_layer_024": 0.000469, "vq_loss_layer_025": 0.000507, "vq_loss_layer_026": 0.000847, "vq_loss_layer_027": 0.000751, "vq_loss_layer_028": 0.001221, "vq_loss_layer_029": 0.00325, "vq_loss_layer_030": 0.002899, "vq_loss_layer_031": 0.007538 }, { "ce_loss": 2.300272, "epoch": 0.00277, "grad_norm": 0.005727333016693592, "key_mse_loss_layer_000": 0.002823, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.121582, "key_mse_loss_layer_015": 0.109863, "key_mse_loss_layer_016": 0.102539, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.104004, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.100098, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.058237, "kv_vq_loss": 0.000722, "learning_rate": 0.0008606199422661121, "loss": 0.058975, "step": 2770, "value_mse_loss_layer_000": 0.000824, "value_mse_loss_layer_001": 0.002411, "value_mse_loss_layer_002": 0.009521, "value_mse_loss_layer_003": 0.014832, "value_mse_loss_layer_004": 0.01416, "value_mse_loss_layer_005": 0.013977, "value_mse_loss_layer_006": 0.016357, "value_mse_loss_layer_007": 0.018555, "value_mse_loss_layer_008": 0.021362, "value_mse_loss_layer_009": 0.0271, "value_mse_loss_layer_010": 0.023926, "value_mse_loss_layer_011": 0.025635, "value_mse_loss_layer_012": 0.025757, "value_mse_loss_layer_013": 0.027954, "value_mse_loss_layer_014": 0.027954, "value_mse_loss_layer_015": 0.031738, "value_mse_loss_layer_016": 0.026123, "value_mse_loss_layer_017": 0.032227, "value_mse_loss_layer_018": 0.027832, "value_mse_loss_layer_019": 0.032959, "value_mse_loss_layer_020": 0.035156, "value_mse_loss_layer_021": 0.041992, "value_mse_loss_layer_022": 0.038818, "value_mse_loss_layer_023": 0.044434, "value_mse_loss_layer_024": 0.047852, "value_mse_loss_layer_025": 0.063477, "value_mse_loss_layer_026": 0.048828, "value_mse_loss_layer_027": 0.08252, "value_mse_loss_layer_028": 0.069824, "value_mse_loss_layer_029": 0.110352, "value_mse_loss_layer_030": 0.086914, "value_mse_loss_layer_031": 0.110352, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 9.6e-05, "vq_loss_layer_005": 0.000114, "vq_loss_layer_006": 0.000172, "vq_loss_layer_007": 0.000263, "vq_loss_layer_008": 0.000267, "vq_loss_layer_009": 0.000309, "vq_loss_layer_010": 0.000288, "vq_loss_layer_011": 0.000341, "vq_loss_layer_012": 0.000553, "vq_loss_layer_013": 0.000444, "vq_loss_layer_014": 0.000542, "vq_loss_layer_015": 0.000687, "vq_loss_layer_016": 0.000484, "vq_loss_layer_017": 0.000706, "vq_loss_layer_018": 0.000254, "vq_loss_layer_019": 0.000212, "vq_loss_layer_020": 0.000271, "vq_loss_layer_021": 0.000542, "vq_loss_layer_022": 0.000317, "vq_loss_layer_023": 0.000416, "vq_loss_layer_024": 0.000334, "vq_loss_layer_025": 0.000462, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.000984, "vq_loss_layer_028": 0.001335, "vq_loss_layer_029": 0.001617, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.006622 }, { "ce_loss": 2.256481, "epoch": 0.00278, "grad_norm": 0.004834247287362814, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.058292, "kv_vq_loss": 0.000712, "learning_rate": 0.0008610111989795189, "loss": 0.059039, "step": 2780, "value_mse_loss_layer_000": 0.000832, "value_mse_loss_layer_001": 0.002365, "value_mse_loss_layer_002": 0.009216, "value_mse_loss_layer_003": 0.016235, "value_mse_loss_layer_004": 0.013916, "value_mse_loss_layer_005": 0.01355, "value_mse_loss_layer_006": 0.015869, "value_mse_loss_layer_007": 0.0177, "value_mse_loss_layer_008": 0.022217, "value_mse_loss_layer_009": 0.027832, "value_mse_loss_layer_010": 0.024048, "value_mse_loss_layer_011": 0.025146, "value_mse_loss_layer_012": 0.025513, "value_mse_loss_layer_013": 0.026855, "value_mse_loss_layer_014": 0.028564, "value_mse_loss_layer_015": 0.031738, "value_mse_loss_layer_016": 0.026978, "value_mse_loss_layer_017": 0.030884, "value_mse_loss_layer_018": 0.031006, "value_mse_loss_layer_019": 0.034424, "value_mse_loss_layer_020": 0.034912, "value_mse_loss_layer_021": 0.047119, "value_mse_loss_layer_022": 0.040527, "value_mse_loss_layer_023": 0.05127, "value_mse_loss_layer_024": 0.048584, "value_mse_loss_layer_025": 0.0625, "value_mse_loss_layer_026": 0.057129, "value_mse_loss_layer_027": 0.075195, "value_mse_loss_layer_028": 0.070801, "value_mse_loss_layer_029": 0.101074, "value_mse_loss_layer_030": 0.090332, "value_mse_loss_layer_031": 0.108398, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 9.9e-05, "vq_loss_layer_006": 0.000165, "vq_loss_layer_007": 0.00025, "vq_loss_layer_008": 0.000284, "vq_loss_layer_009": 0.000336, "vq_loss_layer_010": 0.000315, "vq_loss_layer_011": 0.000324, "vq_loss_layer_012": 0.000507, "vq_loss_layer_013": 0.000416, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000607, "vq_loss_layer_016": 0.000549, "vq_loss_layer_017": 0.000511, "vq_loss_layer_018": 0.000336, "vq_loss_layer_019": 0.00023, "vq_loss_layer_020": 0.000273, "vq_loss_layer_021": 0.000671, "vq_loss_layer_022": 0.00033, "vq_loss_layer_023": 0.000492, "vq_loss_layer_024": 0.00033, "vq_loss_layer_025": 0.000401, "vq_loss_layer_026": 0.000786, "vq_loss_layer_027": 0.000881, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001503, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.006622 }, { "ce_loss": 2.267523, "epoch": 0.00279, "grad_norm": 0.003932879772037268, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.058466, "kv_vq_loss": 0.000729, "learning_rate": 0.0008614010508183992, "loss": 0.059225, "step": 2790, "value_mse_loss_layer_000": 0.000824, "value_mse_loss_layer_001": 0.002335, "value_mse_loss_layer_002": 0.009644, "value_mse_loss_layer_003": 0.015869, "value_mse_loss_layer_004": 0.013855, "value_mse_loss_layer_005": 0.015198, "value_mse_loss_layer_006": 0.016846, "value_mse_loss_layer_007": 0.018433, "value_mse_loss_layer_008": 0.02124, "value_mse_loss_layer_009": 0.028198, "value_mse_loss_layer_010": 0.023926, "value_mse_loss_layer_011": 0.026367, "value_mse_loss_layer_012": 0.027588, "value_mse_loss_layer_013": 0.030396, "value_mse_loss_layer_014": 0.029785, "value_mse_loss_layer_015": 0.030884, "value_mse_loss_layer_016": 0.026245, "value_mse_loss_layer_017": 0.030518, "value_mse_loss_layer_018": 0.029175, "value_mse_loss_layer_019": 0.032715, "value_mse_loss_layer_020": 0.040039, "value_mse_loss_layer_021": 0.041016, "value_mse_loss_layer_022": 0.039551, "value_mse_loss_layer_023": 0.047852, "value_mse_loss_layer_024": 0.050049, "value_mse_loss_layer_025": 0.060059, "value_mse_loss_layer_026": 0.051514, "value_mse_loss_layer_027": 0.067383, "value_mse_loss_layer_028": 0.067383, "value_mse_loss_layer_029": 0.097656, "value_mse_loss_layer_030": 0.086914, "value_mse_loss_layer_031": 0.10791, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 0.000141, "vq_loss_layer_006": 0.000181, "vq_loss_layer_007": 0.000277, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000347, "vq_loss_layer_010": 0.000326, "vq_loss_layer_011": 0.000372, "vq_loss_layer_012": 0.000614, "vq_loss_layer_013": 0.000523, "vq_loss_layer_014": 0.000595, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.000507, "vq_loss_layer_017": 0.000534, "vq_loss_layer_018": 0.000313, "vq_loss_layer_019": 0.000244, "vq_loss_layer_020": 0.000349, "vq_loss_layer_021": 0.000538, "vq_loss_layer_022": 0.000326, "vq_loss_layer_023": 0.00045, "vq_loss_layer_024": 0.000368, "vq_loss_layer_025": 0.000433, "vq_loss_layer_026": 0.000652, "vq_loss_layer_027": 0.000679, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001465, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.006683 }, { "ce_loss": 2.274165, "epoch": 0.0028, "grad_norm": 0.004411070141941309, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.05853, "kv_vq_loss": 0.000722, "learning_rate": 0.0008617895078355547, "loss": 0.059277, "step": 2800, "value_mse_loss_layer_000": 0.000824, "value_mse_loss_layer_001": 0.002335, "value_mse_loss_layer_002": 0.00946, "value_mse_loss_layer_003": 0.014526, "value_mse_loss_layer_004": 0.013, "value_mse_loss_layer_005": 0.013489, "value_mse_loss_layer_006": 0.016113, "value_mse_loss_layer_007": 0.017578, "value_mse_loss_layer_008": 0.020996, "value_mse_loss_layer_009": 0.029663, "value_mse_loss_layer_010": 0.024658, "value_mse_loss_layer_011": 0.024902, "value_mse_loss_layer_012": 0.026245, "value_mse_loss_layer_013": 0.027466, "value_mse_loss_layer_014": 0.029297, "value_mse_loss_layer_015": 0.032471, "value_mse_loss_layer_016": 0.02771, "value_mse_loss_layer_017": 0.031006, "value_mse_loss_layer_018": 0.027466, "value_mse_loss_layer_019": 0.033203, "value_mse_loss_layer_020": 0.03418, "value_mse_loss_layer_021": 0.042969, "value_mse_loss_layer_022": 0.039307, "value_mse_loss_layer_023": 0.045898, "value_mse_loss_layer_024": 0.047607, "value_mse_loss_layer_025": 0.058594, "value_mse_loss_layer_026": 0.050781, "value_mse_loss_layer_027": 0.063477, "value_mse_loss_layer_028": 0.065918, "value_mse_loss_layer_029": 0.101562, "value_mse_loss_layer_030": 0.085449, "value_mse_loss_layer_031": 0.103516, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 7.7e-05, "vq_loss_layer_005": 0.000101, "vq_loss_layer_006": 0.00018, "vq_loss_layer_007": 0.000254, "vq_loss_layer_008": 0.000244, "vq_loss_layer_009": 0.00045, "vq_loss_layer_010": 0.000315, "vq_loss_layer_011": 0.000309, "vq_loss_layer_012": 0.000557, "vq_loss_layer_013": 0.00053, "vq_loss_layer_014": 0.000549, "vq_loss_layer_015": 0.00061, "vq_loss_layer_016": 0.000591, "vq_loss_layer_017": 0.000542, "vq_loss_layer_018": 0.00028, "vq_loss_layer_019": 0.000256, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.000603, "vq_loss_layer_022": 0.000338, "vq_loss_layer_023": 0.000425, "vq_loss_layer_024": 0.000357, "vq_loss_layer_025": 0.000418, "vq_loss_layer_026": 0.000679, "vq_loss_layer_027": 0.000698, "vq_loss_layer_028": 0.000935, "vq_loss_layer_029": 0.001434, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.006226 }, { "ce_loss": 2.304347, "epoch": 0.00281, "grad_norm": 0.005357360001653433, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.05787, "kv_vq_loss": 0.000708, "learning_rate": 0.0008621765799762699, "loss": 0.058621, "step": 2810, "value_mse_loss_layer_000": 0.000843, "value_mse_loss_layer_001": 0.00238, "value_mse_loss_layer_002": 0.009277, "value_mse_loss_layer_003": 0.016724, "value_mse_loss_layer_004": 0.014587, "value_mse_loss_layer_005": 0.01355, "value_mse_loss_layer_006": 0.015869, "value_mse_loss_layer_007": 0.018433, "value_mse_loss_layer_008": 0.022583, "value_mse_loss_layer_009": 0.026733, "value_mse_loss_layer_010": 0.023071, "value_mse_loss_layer_011": 0.024536, "value_mse_loss_layer_012": 0.02478, "value_mse_loss_layer_013": 0.026367, "value_mse_loss_layer_014": 0.027832, "value_mse_loss_layer_015": 0.030151, "value_mse_loss_layer_016": 0.025879, "value_mse_loss_layer_017": 0.029297, "value_mse_loss_layer_018": 0.027344, "value_mse_loss_layer_019": 0.032227, "value_mse_loss_layer_020": 0.033203, "value_mse_loss_layer_021": 0.050049, "value_mse_loss_layer_022": 0.043701, "value_mse_loss_layer_023": 0.044678, "value_mse_loss_layer_024": 0.048096, "value_mse_loss_layer_025": 0.064941, "value_mse_loss_layer_026": 0.049805, "value_mse_loss_layer_027": 0.071289, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.100098, "value_mse_loss_layer_030": 0.086426, "value_mse_loss_layer_031": 0.106934, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 4.3e-05, "vq_loss_layer_004": 0.000123, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000162, "vq_loss_layer_007": 0.000288, "vq_loss_layer_008": 0.000366, "vq_loss_layer_009": 0.00034, "vq_loss_layer_010": 0.000301, "vq_loss_layer_011": 0.000332, "vq_loss_layer_012": 0.000515, "vq_loss_layer_013": 0.000418, "vq_loss_layer_014": 0.000534, "vq_loss_layer_015": 0.000542, "vq_loss_layer_016": 0.000538, "vq_loss_layer_017": 0.000441, "vq_loss_layer_018": 0.000259, "vq_loss_layer_019": 0.000232, "vq_loss_layer_020": 0.000248, "vq_loss_layer_021": 0.000774, "vq_loss_layer_022": 0.000475, "vq_loss_layer_023": 0.00037, "vq_loss_layer_024": 0.000389, "vq_loss_layer_025": 0.0005, "vq_loss_layer_026": 0.000656, "vq_loss_layer_027": 0.000858, "vq_loss_layer_028": 0.00103, "vq_loss_layer_029": 0.001648, "vq_loss_layer_030": 0.002563, "vq_loss_layer_031": 0.007294 }, { "ce_loss": 2.243048, "epoch": 0.00282, "grad_norm": 0.006544545292854309, "key_mse_loss_layer_000": 0.002823, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.058801, "kv_vq_loss": 0.000748, "learning_rate": 0.0008625622770798402, "loss": 0.059586, "step": 2820, "value_mse_loss_layer_000": 0.000816, "value_mse_loss_layer_001": 0.00235, "value_mse_loss_layer_002": 0.009583, "value_mse_loss_layer_003": 0.014648, "value_mse_loss_layer_004": 0.013855, "value_mse_loss_layer_005": 0.01416, "value_mse_loss_layer_006": 0.016357, "value_mse_loss_layer_007": 0.018555, "value_mse_loss_layer_008": 0.021729, "value_mse_loss_layer_009": 0.028076, "value_mse_loss_layer_010": 0.025391, "value_mse_loss_layer_011": 0.025391, "value_mse_loss_layer_012": 0.025635, "value_mse_loss_layer_013": 0.030029, "value_mse_loss_layer_014": 0.030151, "value_mse_loss_layer_015": 0.031494, "value_mse_loss_layer_016": 0.029175, "value_mse_loss_layer_017": 0.031738, "value_mse_loss_layer_018": 0.030518, "value_mse_loss_layer_019": 0.031982, "value_mse_loss_layer_020": 0.034668, "value_mse_loss_layer_021": 0.043701, "value_mse_loss_layer_022": 0.040527, "value_mse_loss_layer_023": 0.044189, "value_mse_loss_layer_024": 0.051025, "value_mse_loss_layer_025": 0.061035, "value_mse_loss_layer_026": 0.050049, "value_mse_loss_layer_027": 0.063965, "value_mse_loss_layer_028": 0.069336, "value_mse_loss_layer_029": 0.120605, "value_mse_loss_layer_030": 0.089355, "value_mse_loss_layer_031": 0.11084, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 9.1e-05, "vq_loss_layer_005": 0.000116, "vq_loss_layer_006": 0.000174, "vq_loss_layer_007": 0.000275, "vq_loss_layer_008": 0.000284, "vq_loss_layer_009": 0.000345, "vq_loss_layer_010": 0.000372, "vq_loss_layer_011": 0.00034, "vq_loss_layer_012": 0.000519, "vq_loss_layer_013": 0.000557, "vq_loss_layer_014": 0.000641, "vq_loss_layer_015": 0.000576, "vq_loss_layer_016": 0.000645, "vq_loss_layer_017": 0.000603, "vq_loss_layer_018": 0.000322, "vq_loss_layer_019": 0.000254, "vq_loss_layer_020": 0.000296, "vq_loss_layer_021": 0.000698, "vq_loss_layer_022": 0.000463, "vq_loss_layer_023": 0.000452, "vq_loss_layer_024": 0.000469, "vq_loss_layer_025": 0.000561, "vq_loss_layer_026": 0.000809, "vq_loss_layer_027": 0.000782, "vq_loss_layer_028": 0.001389, "vq_loss_layer_029": 0.002274, "vq_loss_layer_030": 0.003448, "vq_loss_layer_031": 0.007324 }, { "ce_loss": 2.255983, "epoch": 0.00283, "grad_norm": 0.004123609513044357, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.058307, "kv_vq_loss": 0.00072, "learning_rate": 0.0008629466088810725, "loss": 0.059067, "step": 2830, "value_mse_loss_layer_000": 0.000832, "value_mse_loss_layer_001": 0.00235, "value_mse_loss_layer_002": 0.00946, "value_mse_loss_layer_003": 0.014404, "value_mse_loss_layer_004": 0.013184, "value_mse_loss_layer_005": 0.013428, "value_mse_loss_layer_006": 0.016602, "value_mse_loss_layer_007": 0.017944, "value_mse_loss_layer_008": 0.020874, "value_mse_loss_layer_009": 0.028442, "value_mse_loss_layer_010": 0.023804, "value_mse_loss_layer_011": 0.025146, "value_mse_loss_layer_012": 0.025269, "value_mse_loss_layer_013": 0.026611, "value_mse_loss_layer_014": 0.030151, "value_mse_loss_layer_015": 0.029663, "value_mse_loss_layer_016": 0.026855, "value_mse_loss_layer_017": 0.029907, "value_mse_loss_layer_018": 0.030029, "value_mse_loss_layer_019": 0.035645, "value_mse_loss_layer_020": 0.040039, "value_mse_loss_layer_021": 0.041504, "value_mse_loss_layer_022": 0.040283, "value_mse_loss_layer_023": 0.047852, "value_mse_loss_layer_024": 0.050537, "value_mse_loss_layer_025": 0.069336, "value_mse_loss_layer_026": 0.052734, "value_mse_loss_layer_027": 0.069336, "value_mse_loss_layer_028": 0.069336, "value_mse_loss_layer_029": 0.100586, "value_mse_loss_layer_030": 0.092285, "value_mse_loss_layer_031": 0.111328, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 0.000105, "vq_loss_layer_006": 0.000204, "vq_loss_layer_007": 0.000267, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000412, "vq_loss_layer_010": 0.000292, "vq_loss_layer_011": 0.000324, "vq_loss_layer_012": 0.00053, "vq_loss_layer_013": 0.000416, "vq_loss_layer_014": 0.000645, "vq_loss_layer_015": 0.000504, "vq_loss_layer_016": 0.000553, "vq_loss_layer_017": 0.000483, "vq_loss_layer_018": 0.000324, "vq_loss_layer_019": 0.000259, "vq_loss_layer_020": 0.000322, "vq_loss_layer_021": 0.000488, "vq_loss_layer_022": 0.000311, "vq_loss_layer_023": 0.000376, "vq_loss_layer_024": 0.00032, "vq_loss_layer_025": 0.00041, "vq_loss_layer_026": 0.000629, "vq_loss_layer_027": 0.000656, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.001595, "vq_loss_layer_030": 0.002701, "vq_loss_layer_031": 0.006165 }, { "ce_loss": 2.272816, "epoch": 0.00284, "grad_norm": 0.0042580654844641685, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.057883, "kv_vq_loss": 0.000707, "learning_rate": 0.0008633295850117594, "loss": 0.058627, "step": 2840, "value_mse_loss_layer_000": 0.000843, "value_mse_loss_layer_001": 0.002441, "value_mse_loss_layer_002": 0.009827, "value_mse_loss_layer_003": 0.016479, "value_mse_loss_layer_004": 0.015869, "value_mse_loss_layer_005": 0.014526, "value_mse_loss_layer_006": 0.016479, "value_mse_loss_layer_007": 0.018188, "value_mse_loss_layer_008": 0.021362, "value_mse_loss_layer_009": 0.027832, "value_mse_loss_layer_010": 0.023682, "value_mse_loss_layer_011": 0.026001, "value_mse_loss_layer_012": 0.029419, "value_mse_loss_layer_013": 0.028687, "value_mse_loss_layer_014": 0.029297, "value_mse_loss_layer_015": 0.033447, "value_mse_loss_layer_016": 0.027588, "value_mse_loss_layer_017": 0.030762, "value_mse_loss_layer_018": 0.029175, "value_mse_loss_layer_019": 0.034912, "value_mse_loss_layer_020": 0.036621, "value_mse_loss_layer_021": 0.040527, "value_mse_loss_layer_022": 0.041504, "value_mse_loss_layer_023": 0.044189, "value_mse_loss_layer_024": 0.046143, "value_mse_loss_layer_025": 0.060059, "value_mse_loss_layer_026": 0.049316, "value_mse_loss_layer_027": 0.065918, "value_mse_loss_layer_028": 0.068359, "value_mse_loss_layer_029": 0.094727, "value_mse_loss_layer_030": 0.087402, "value_mse_loss_layer_031": 0.106934, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 0.000156, "vq_loss_layer_005": 0.000141, "vq_loss_layer_006": 0.000205, "vq_loss_layer_007": 0.000267, "vq_loss_layer_008": 0.00029, "vq_loss_layer_009": 0.000336, "vq_loss_layer_010": 0.000309, "vq_loss_layer_011": 0.000381, "vq_loss_layer_012": 0.000793, "vq_loss_layer_013": 0.000481, "vq_loss_layer_014": 0.000557, "vq_loss_layer_015": 0.000736, "vq_loss_layer_016": 0.000576, "vq_loss_layer_017": 0.000448, "vq_loss_layer_018": 0.000315, "vq_loss_layer_019": 0.000294, "vq_loss_layer_020": 0.000328, "vq_loss_layer_021": 0.000576, "vq_loss_layer_022": 0.000393, "vq_loss_layer_023": 0.00041, "vq_loss_layer_024": 0.000355, "vq_loss_layer_025": 0.0005, "vq_loss_layer_026": 0.00069, "vq_loss_layer_027": 0.000904, "vq_loss_layer_028": 0.001114, "vq_loss_layer_029": 0.001518, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.006897 }, { "ce_loss": 2.291032, "epoch": 0.00285, "grad_norm": 0.00522254453971982, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.075684, "key_mse_loss_layer_030": 0.076172, "key_mse_loss_layer_031": 0.060547, "kv_mse_loss": 0.058768, "kv_vq_loss": 0.000763, "learning_rate": 0.0008637112150021274, "loss": 0.059537, "step": 2850, "value_mse_loss_layer_000": 0.000813, "value_mse_loss_layer_001": 0.00235, "value_mse_loss_layer_002": 0.009399, "value_mse_loss_layer_003": 0.014709, "value_mse_loss_layer_004": 0.013672, "value_mse_loss_layer_005": 0.014099, "value_mse_loss_layer_006": 0.016235, "value_mse_loss_layer_007": 0.018066, "value_mse_loss_layer_008": 0.021484, "value_mse_loss_layer_009": 0.032471, "value_mse_loss_layer_010": 0.025391, "value_mse_loss_layer_011": 0.026489, "value_mse_loss_layer_012": 0.027832, "value_mse_loss_layer_013": 0.028809, "value_mse_loss_layer_014": 0.029297, "value_mse_loss_layer_015": 0.032715, "value_mse_loss_layer_016": 0.026611, "value_mse_loss_layer_017": 0.031982, "value_mse_loss_layer_018": 0.029907, "value_mse_loss_layer_019": 0.033203, "value_mse_loss_layer_020": 0.033691, "value_mse_loss_layer_021": 0.04541, "value_mse_loss_layer_022": 0.038818, "value_mse_loss_layer_023": 0.044189, "value_mse_loss_layer_024": 0.046387, "value_mse_loss_layer_025": 0.055908, "value_mse_loss_layer_026": 0.044678, "value_mse_loss_layer_027": 0.064453, "value_mse_loss_layer_028": 0.064453, "value_mse_loss_layer_029": 0.098145, "value_mse_loss_layer_030": 0.082031, "value_mse_loss_layer_031": 0.109375, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 0.000113, "vq_loss_layer_006": 0.000176, "vq_loss_layer_007": 0.000248, "vq_loss_layer_008": 0.000275, "vq_loss_layer_009": 0.000637, "vq_loss_layer_010": 0.000362, "vq_loss_layer_011": 0.000383, "vq_loss_layer_012": 0.000683, "vq_loss_layer_013": 0.000471, "vq_loss_layer_014": 0.000576, "vq_loss_layer_015": 0.000679, "vq_loss_layer_016": 0.000553, "vq_loss_layer_017": 0.000546, "vq_loss_layer_018": 0.000336, "vq_loss_layer_019": 0.000286, "vq_loss_layer_020": 0.000286, "vq_loss_layer_021": 0.000725, "vq_loss_layer_022": 0.000376, "vq_loss_layer_023": 0.000496, "vq_loss_layer_024": 0.000393, "vq_loss_layer_025": 0.000534, "vq_loss_layer_026": 0.000603, "vq_loss_layer_027": 0.000896, "vq_loss_layer_028": 0.001129, "vq_loss_layer_029": 0.001762, "vq_loss_layer_030": 0.002701, "vq_loss_layer_031": 0.00769 }, { "ce_loss": 2.259278, "epoch": 0.00286, "grad_norm": 0.0043663568794727325, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.058386, "kv_vq_loss": 0.000744, "learning_rate": 0.0008640915082822607, "loss": 0.059158, "step": 2860, "value_mse_loss_layer_000": 0.000835, "value_mse_loss_layer_001": 0.002319, "value_mse_loss_layer_002": 0.009766, "value_mse_loss_layer_003": 0.014709, "value_mse_loss_layer_004": 0.013977, "value_mse_loss_layer_005": 0.013855, "value_mse_loss_layer_006": 0.016479, "value_mse_loss_layer_007": 0.019897, "value_mse_loss_layer_008": 0.021606, "value_mse_loss_layer_009": 0.027832, "value_mse_loss_layer_010": 0.023804, "value_mse_loss_layer_011": 0.025635, "value_mse_loss_layer_012": 0.026489, "value_mse_loss_layer_013": 0.0271, "value_mse_loss_layer_014": 0.029541, "value_mse_loss_layer_015": 0.030518, "value_mse_loss_layer_016": 0.026001, "value_mse_loss_layer_017": 0.030396, "value_mse_loss_layer_018": 0.028687, "value_mse_loss_layer_019": 0.032959, "value_mse_loss_layer_020": 0.034668, "value_mse_loss_layer_021": 0.043945, "value_mse_loss_layer_022": 0.04126, "value_mse_loss_layer_023": 0.046387, "value_mse_loss_layer_024": 0.050293, "value_mse_loss_layer_025": 0.061279, "value_mse_loss_layer_026": 0.067871, "value_mse_loss_layer_027": 0.070801, "value_mse_loss_layer_028": 0.072266, "value_mse_loss_layer_029": 0.105469, "value_mse_loss_layer_030": 0.094238, "value_mse_loss_layer_031": 0.11377, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 8.7e-05, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000182, "vq_loss_layer_007": 0.000324, "vq_loss_layer_008": 0.000309, "vq_loss_layer_009": 0.000385, "vq_loss_layer_010": 0.000326, "vq_loss_layer_011": 0.000393, "vq_loss_layer_012": 0.000591, "vq_loss_layer_013": 0.000475, "vq_loss_layer_014": 0.000599, "vq_loss_layer_015": 0.000622, "vq_loss_layer_016": 0.000561, "vq_loss_layer_017": 0.000515, "vq_loss_layer_018": 0.00033, "vq_loss_layer_019": 0.000244, "vq_loss_layer_020": 0.000296, "vq_loss_layer_021": 0.000645, "vq_loss_layer_022": 0.000389, "vq_loss_layer_023": 0.000406, "vq_loss_layer_024": 0.000404, "vq_loss_layer_025": 0.000519, "vq_loss_layer_026": 0.00116, "vq_loss_layer_027": 0.000824, "vq_loss_layer_028": 0.001297, "vq_loss_layer_029": 0.001831, "vq_loss_layer_030": 0.002869, "vq_loss_layer_031": 0.00766 }, { "ce_loss": 2.281764, "epoch": 0.00287, "grad_norm": 0.004126475192606449, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.058017, "kv_vq_loss": 0.000718, "learning_rate": 0.0008644704741834979, "loss": 0.058765, "step": 2870, "value_mse_loss_layer_000": 0.000828, "value_mse_loss_layer_001": 0.002335, "value_mse_loss_layer_002": 0.01001, "value_mse_loss_layer_003": 0.019287, "value_mse_loss_layer_004": 0.014893, "value_mse_loss_layer_005": 0.015137, "value_mse_loss_layer_006": 0.016479, "value_mse_loss_layer_007": 0.017944, "value_mse_loss_layer_008": 0.021362, "value_mse_loss_layer_009": 0.027344, "value_mse_loss_layer_010": 0.023438, "value_mse_loss_layer_011": 0.024414, "value_mse_loss_layer_012": 0.026489, "value_mse_loss_layer_013": 0.027588, "value_mse_loss_layer_014": 0.029663, "value_mse_loss_layer_015": 0.031982, "value_mse_loss_layer_016": 0.025757, "value_mse_loss_layer_017": 0.030396, "value_mse_loss_layer_018": 0.028809, "value_mse_loss_layer_019": 0.031982, "value_mse_loss_layer_020": 0.036133, "value_mse_loss_layer_021": 0.04126, "value_mse_loss_layer_022": 0.037598, "value_mse_loss_layer_023": 0.043213, "value_mse_loss_layer_024": 0.048584, "value_mse_loss_layer_025": 0.058594, "value_mse_loss_layer_026": 0.054688, "value_mse_loss_layer_027": 0.062256, "value_mse_loss_layer_028": 0.068848, "value_mse_loss_layer_029": 0.094238, "value_mse_loss_layer_030": 0.091797, "value_mse_loss_layer_031": 0.105469, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 6e-05, "vq_loss_layer_004": 0.000128, "vq_loss_layer_005": 0.000131, "vq_loss_layer_006": 0.000192, "vq_loss_layer_007": 0.000246, "vq_loss_layer_008": 0.00029, "vq_loss_layer_009": 0.000326, "vq_loss_layer_010": 0.000309, "vq_loss_layer_011": 0.000319, "vq_loss_layer_012": 0.000584, "vq_loss_layer_013": 0.000456, "vq_loss_layer_014": 0.000568, "vq_loss_layer_015": 0.000786, "vq_loss_layer_016": 0.000546, "vq_loss_layer_017": 0.000504, "vq_loss_layer_018": 0.000288, "vq_loss_layer_019": 0.000246, "vq_loss_layer_020": 0.000282, "vq_loss_layer_021": 0.000595, "vq_loss_layer_022": 0.000317, "vq_loss_layer_023": 0.000385, "vq_loss_layer_024": 0.000412, "vq_loss_layer_025": 0.000483, "vq_loss_layer_026": 0.000809, "vq_loss_layer_027": 0.000668, "vq_loss_layer_028": 0.001236, "vq_loss_layer_029": 0.001633, "vq_loss_layer_030": 0.00296, "vq_loss_layer_031": 0.007141 }, { "ce_loss": 2.306916, "epoch": 0.00288, "grad_norm": 0.006113471928983927, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.058124, "kv_vq_loss": 0.000713, "learning_rate": 0.0008648481219398076, "loss": 0.058862, "step": 2880, "value_mse_loss_layer_000": 0.000828, "value_mse_loss_layer_001": 0.002304, "value_mse_loss_layer_002": 0.009521, "value_mse_loss_layer_003": 0.016968, "value_mse_loss_layer_004": 0.013794, "value_mse_loss_layer_005": 0.013794, "value_mse_loss_layer_006": 0.015747, "value_mse_loss_layer_007": 0.017822, "value_mse_loss_layer_008": 0.021606, "value_mse_loss_layer_009": 0.028076, "value_mse_loss_layer_010": 0.023682, "value_mse_loss_layer_011": 0.025269, "value_mse_loss_layer_012": 0.028564, "value_mse_loss_layer_013": 0.028687, "value_mse_loss_layer_014": 0.029053, "value_mse_loss_layer_015": 0.030884, "value_mse_loss_layer_016": 0.026123, "value_mse_loss_layer_017": 0.03064, "value_mse_loss_layer_018": 0.028198, "value_mse_loss_layer_019": 0.032227, "value_mse_loss_layer_020": 0.034424, "value_mse_loss_layer_021": 0.050537, "value_mse_loss_layer_022": 0.041504, "value_mse_loss_layer_023": 0.055908, "value_mse_loss_layer_024": 0.052246, "value_mse_loss_layer_025": 0.05835, "value_mse_loss_layer_026": 0.052002, "value_mse_loss_layer_027": 0.074707, "value_mse_loss_layer_028": 0.070312, "value_mse_loss_layer_029": 0.100586, "value_mse_loss_layer_030": 0.088867, "value_mse_loss_layer_031": 0.121094, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 5e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 9.9e-05, "vq_loss_layer_006": 0.00016, "vq_loss_layer_007": 0.000243, "vq_loss_layer_008": 0.000299, "vq_loss_layer_009": 0.000378, "vq_loss_layer_010": 0.000307, "vq_loss_layer_011": 0.000355, "vq_loss_layer_012": 0.000763, "vq_loss_layer_013": 0.000496, "vq_loss_layer_014": 0.000584, "vq_loss_layer_015": 0.000622, "vq_loss_layer_016": 0.000587, "vq_loss_layer_017": 0.000488, "vq_loss_layer_018": 0.000286, "vq_loss_layer_019": 0.000271, "vq_loss_layer_020": 0.000288, "vq_loss_layer_021": 0.000767, "vq_loss_layer_022": 0.000401, "vq_loss_layer_023": 0.000694, "vq_loss_layer_024": 0.000488, "vq_loss_layer_025": 0.000614, "vq_loss_layer_026": 0.000668, "vq_loss_layer_027": 0.0009, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.001816, "vq_loss_layer_030": 0.003174, "vq_loss_layer_031": 0.009155 }, { "ce_loss": 2.309494, "epoch": 0.00289, "grad_norm": 0.004713158123195171, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.058209, "kv_vq_loss": 0.000715, "learning_rate": 0.0008652244606891368, "loss": 0.058954, "step": 2890, "value_mse_loss_layer_000": 0.000824, "value_mse_loss_layer_001": 0.00235, "value_mse_loss_layer_002": 0.010315, "value_mse_loss_layer_003": 0.014526, "value_mse_loss_layer_004": 0.013184, "value_mse_loss_layer_005": 0.013245, "value_mse_loss_layer_006": 0.015869, "value_mse_loss_layer_007": 0.018311, "value_mse_loss_layer_008": 0.020752, "value_mse_loss_layer_009": 0.027344, "value_mse_loss_layer_010": 0.024414, "value_mse_loss_layer_011": 0.023926, "value_mse_loss_layer_012": 0.026001, "value_mse_loss_layer_013": 0.026245, "value_mse_loss_layer_014": 0.027832, "value_mse_loss_layer_015": 0.030151, "value_mse_loss_layer_016": 0.029663, "value_mse_loss_layer_017": 0.030151, "value_mse_loss_layer_018": 0.0271, "value_mse_loss_layer_019": 0.03418, "value_mse_loss_layer_020": 0.033691, "value_mse_loss_layer_021": 0.041504, "value_mse_loss_layer_022": 0.040771, "value_mse_loss_layer_023": 0.04541, "value_mse_loss_layer_024": 0.046631, "value_mse_loss_layer_025": 0.05835, "value_mse_loss_layer_026": 0.047852, "value_mse_loss_layer_027": 0.066406, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.108398, "value_mse_loss_layer_030": 0.084961, "value_mse_loss_layer_031": 0.108398, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 8.4e-05, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000184, "vq_loss_layer_007": 0.000288, "vq_loss_layer_008": 0.000246, "vq_loss_layer_009": 0.000343, "vq_loss_layer_010": 0.000305, "vq_loss_layer_011": 0.000296, "vq_loss_layer_012": 0.00061, "vq_loss_layer_013": 0.000443, "vq_loss_layer_014": 0.000507, "vq_loss_layer_015": 0.000515, "vq_loss_layer_016": 0.000626, "vq_loss_layer_017": 0.000507, "vq_loss_layer_018": 0.000246, "vq_loss_layer_019": 0.000222, "vq_loss_layer_020": 0.000278, "vq_loss_layer_021": 0.000526, "vq_loss_layer_022": 0.000338, "vq_loss_layer_023": 0.000414, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.000391, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000652, "vq_loss_layer_028": 0.000866, "vq_loss_layer_029": 0.001503, "vq_loss_layer_030": 0.002731, "vq_loss_layer_031": 0.0065 }, { "ce_loss": 2.359469, "epoch": 0.0029, "grad_norm": 0.004493036773055792, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.05755, "kv_vq_loss": 0.000687, "learning_rate": 0.0008655994994747389, "loss": 0.05827, "step": 2900, "value_mse_loss_layer_000": 0.000851, "value_mse_loss_layer_001": 0.002396, "value_mse_loss_layer_002": 0.009766, "value_mse_loss_layer_003": 0.014587, "value_mse_loss_layer_004": 0.013916, "value_mse_loss_layer_005": 0.015137, "value_mse_loss_layer_006": 0.01709, "value_mse_loss_layer_007": 0.018188, "value_mse_loss_layer_008": 0.021729, "value_mse_loss_layer_009": 0.027832, "value_mse_loss_layer_010": 0.023682, "value_mse_loss_layer_011": 0.025757, "value_mse_loss_layer_012": 0.026489, "value_mse_loss_layer_013": 0.027588, "value_mse_loss_layer_014": 0.029053, "value_mse_loss_layer_015": 0.033691, "value_mse_loss_layer_016": 0.027588, "value_mse_loss_layer_017": 0.032227, "value_mse_loss_layer_018": 0.029297, "value_mse_loss_layer_019": 0.033936, "value_mse_loss_layer_020": 0.036133, "value_mse_loss_layer_021": 0.041992, "value_mse_loss_layer_022": 0.039307, "value_mse_loss_layer_023": 0.044189, "value_mse_loss_layer_024": 0.049072, "value_mse_loss_layer_025": 0.062256, "value_mse_loss_layer_026": 0.050781, "value_mse_loss_layer_027": 0.063965, "value_mse_loss_layer_028": 0.070312, "value_mse_loss_layer_029": 0.09375, "value_mse_loss_layer_030": 0.091309, "value_mse_loss_layer_031": 0.104492, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 9.2e-05, "vq_loss_layer_005": 0.000138, "vq_loss_layer_006": 0.000206, "vq_loss_layer_007": 0.000265, "vq_loss_layer_008": 0.000292, "vq_loss_layer_009": 0.000338, "vq_loss_layer_010": 0.000288, "vq_loss_layer_011": 0.000359, "vq_loss_layer_012": 0.000549, "vq_loss_layer_013": 0.000444, "vq_loss_layer_014": 0.000572, "vq_loss_layer_015": 0.000675, "vq_loss_layer_016": 0.000618, "vq_loss_layer_017": 0.000561, "vq_loss_layer_018": 0.000368, "vq_loss_layer_019": 0.000271, "vq_loss_layer_020": 0.000389, "vq_loss_layer_021": 0.00058, "vq_loss_layer_022": 0.000341, "vq_loss_layer_023": 0.000393, "vq_loss_layer_024": 0.000435, "vq_loss_layer_025": 0.000492, "vq_loss_layer_026": 0.000702, "vq_loss_layer_027": 0.000771, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.00164, "vq_loss_layer_030": 0.002991, "vq_loss_layer_031": 0.006348 }, { "ce_loss": 2.322788, "epoch": 0.00291, "grad_norm": 0.005529340356588364, "key_mse_loss_layer_000": 0.004456, "key_mse_loss_layer_001": 0.012268, "key_mse_loss_layer_002": 0.0625, "key_mse_loss_layer_003": 0.056396, "key_mse_loss_layer_004": 0.061523, "key_mse_loss_layer_005": 0.069824, "key_mse_loss_layer_006": 0.079102, "key_mse_loss_layer_007": 0.083496, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.098633, "key_mse_loss_layer_020": 0.106445, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.081543, "key_mse_loss_layer_025": 0.079102, "key_mse_loss_layer_026": 0.089355, "key_mse_loss_layer_027": 0.09375, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.093262, "key_mse_loss_layer_030": 0.096191, "key_mse_loss_layer_031": 0.08252, "kv_mse_loss": 0.058267, "kv_vq_loss": 0.000741, "learning_rate": 0.0008659732472464767, "loss": 0.059055, "step": 2910, "value_mse_loss_layer_000": 0.000931, "value_mse_loss_layer_001": 0.002457, "value_mse_loss_layer_002": 0.010132, "value_mse_loss_layer_003": 0.016724, "value_mse_loss_layer_004": 0.014526, "value_mse_loss_layer_005": 0.014587, "value_mse_loss_layer_006": 0.016846, "value_mse_loss_layer_007": 0.018066, "value_mse_loss_layer_008": 0.022339, "value_mse_loss_layer_009": 0.026733, "value_mse_loss_layer_010": 0.022705, "value_mse_loss_layer_011": 0.02417, "value_mse_loss_layer_012": 0.02832, "value_mse_loss_layer_013": 0.025513, "value_mse_loss_layer_014": 0.029785, "value_mse_loss_layer_015": 0.029785, "value_mse_loss_layer_016": 0.0271, "value_mse_loss_layer_017": 0.030029, "value_mse_loss_layer_018": 0.03125, "value_mse_loss_layer_019": 0.036377, "value_mse_loss_layer_020": 0.037109, "value_mse_loss_layer_021": 0.063965, "value_mse_loss_layer_022": 0.049316, "value_mse_loss_layer_023": 0.054688, "value_mse_loss_layer_024": 0.05249, "value_mse_loss_layer_025": 0.071777, "value_mse_loss_layer_026": 0.054688, "value_mse_loss_layer_027": 0.078125, "value_mse_loss_layer_028": 0.070312, "value_mse_loss_layer_029": 0.113281, "value_mse_loss_layer_030": 0.102051, "value_mse_loss_layer_031": 0.114258, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 0.000101, "vq_loss_layer_005": 0.000131, "vq_loss_layer_006": 0.000216, "vq_loss_layer_007": 0.000286, "vq_loss_layer_008": 0.000328, "vq_loss_layer_009": 0.00036, "vq_loss_layer_010": 0.000267, "vq_loss_layer_011": 0.000376, "vq_loss_layer_012": 0.000797, "vq_loss_layer_013": 0.000401, "vq_loss_layer_014": 0.000534, "vq_loss_layer_015": 0.000568, "vq_loss_layer_016": 0.000572, "vq_loss_layer_017": 0.000465, "vq_loss_layer_018": 0.000336, "vq_loss_layer_019": 0.000261, "vq_loss_layer_020": 0.00028, "vq_loss_layer_021": 0.000828, "vq_loss_layer_022": 0.000484, "vq_loss_layer_023": 0.000534, "vq_loss_layer_024": 0.000456, "vq_loss_layer_025": 0.000679, "vq_loss_layer_026": 0.000725, "vq_loss_layer_027": 0.00106, "vq_loss_layer_028": 0.000973, "vq_loss_layer_029": 0.002029, "vq_loss_layer_030": 0.00383, "vq_loss_layer_031": 0.007446 }, { "ce_loss": 2.336517, "epoch": 0.00292, "grad_norm": 0.004797122906893492, "key_mse_loss_layer_000": 0.004211, "key_mse_loss_layer_001": 0.013428, "key_mse_loss_layer_002": 0.067383, "key_mse_loss_layer_003": 0.054199, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.066895, "key_mse_loss_layer_006": 0.080566, "key_mse_loss_layer_007": 0.084961, "key_mse_loss_layer_008": 0.092285, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.108398, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.11377, "key_mse_loss_layer_016": 0.106445, "key_mse_loss_layer_017": 0.110352, "key_mse_loss_layer_018": 0.124023, "key_mse_loss_layer_019": 0.10498, "key_mse_loss_layer_020": 0.118652, "key_mse_loss_layer_021": 0.104492, "key_mse_loss_layer_022": 0.108887, "key_mse_loss_layer_023": 0.115723, "key_mse_loss_layer_024": 0.091309, "key_mse_loss_layer_025": 0.088379, "key_mse_loss_layer_026": 0.105957, "key_mse_loss_layer_027": 0.109375, "key_mse_loss_layer_028": 0.111328, "key_mse_loss_layer_029": 0.103516, "key_mse_loss_layer_030": 0.113281, "key_mse_loss_layer_031": 0.09082, "kv_mse_loss": 0.058664, "kv_vq_loss": 0.000731, "learning_rate": 0.0008663457128621046, "loss": 0.059418, "step": 2920, "value_mse_loss_layer_000": 0.000874, "value_mse_loss_layer_001": 0.002472, "value_mse_loss_layer_002": 0.011047, "value_mse_loss_layer_003": 0.016235, "value_mse_loss_layer_004": 0.015625, "value_mse_loss_layer_005": 0.014465, "value_mse_loss_layer_006": 0.017456, "value_mse_loss_layer_007": 0.019531, "value_mse_loss_layer_008": 0.021118, "value_mse_loss_layer_009": 0.027466, "value_mse_loss_layer_010": 0.024658, "value_mse_loss_layer_011": 0.02478, "value_mse_loss_layer_012": 0.026489, "value_mse_loss_layer_013": 0.028687, "value_mse_loss_layer_014": 0.028931, "value_mse_loss_layer_015": 0.029175, "value_mse_loss_layer_016": 0.027588, "value_mse_loss_layer_017": 0.032959, "value_mse_loss_layer_018": 0.032227, "value_mse_loss_layer_019": 0.039307, "value_mse_loss_layer_020": 0.04126, "value_mse_loss_layer_021": 0.043701, "value_mse_loss_layer_022": 0.040527, "value_mse_loss_layer_023": 0.049316, "value_mse_loss_layer_024": 0.060303, "value_mse_loss_layer_025": 0.09375, "value_mse_loss_layer_026": 0.073242, "value_mse_loss_layer_027": 0.088867, "value_mse_loss_layer_028": 0.073242, "value_mse_loss_layer_029": 0.126953, "value_mse_loss_layer_030": 0.124023, "value_mse_loss_layer_031": 0.139648, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 2.7e-05, "vq_loss_layer_002": 4.2e-05, "vq_loss_layer_003": 6e-05, "vq_loss_layer_004": 0.000127, "vq_loss_layer_005": 0.000134, "vq_loss_layer_006": 0.000254, "vq_loss_layer_007": 0.000311, "vq_loss_layer_008": 0.00032, "vq_loss_layer_009": 0.000383, "vq_loss_layer_010": 0.000345, "vq_loss_layer_011": 0.000376, "vq_loss_layer_012": 0.000637, "vq_loss_layer_013": 0.000477, "vq_loss_layer_014": 0.00058, "vq_loss_layer_015": 0.000572, "vq_loss_layer_016": 0.000557, "vq_loss_layer_017": 0.000591, "vq_loss_layer_018": 0.000299, "vq_loss_layer_019": 0.000271, "vq_loss_layer_020": 0.000284, "vq_loss_layer_021": 0.000633, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.000534, "vq_loss_layer_024": 0.000629, "vq_loss_layer_025": 0.001816, "vq_loss_layer_026": 0.00161, "vq_loss_layer_027": 0.001526, "vq_loss_layer_028": 0.001587, "vq_loss_layer_029": 0.003067, "vq_loss_layer_030": 0.007172, "vq_loss_layer_031": 0.014038 }, { "ce_loss": 2.318928, "epoch": 0.00293, "grad_norm": 0.004566788673400879, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.05817, "kv_vq_loss": 0.000734, "learning_rate": 0.0008667169050885271, "loss": 0.058926, "step": 2930, "value_mse_loss_layer_000": 0.000816, "value_mse_loss_layer_001": 0.00235, "value_mse_loss_layer_002": 0.009888, "value_mse_loss_layer_003": 0.015625, "value_mse_loss_layer_004": 0.014404, "value_mse_loss_layer_005": 0.013611, "value_mse_loss_layer_006": 0.015625, "value_mse_loss_layer_007": 0.017456, "value_mse_loss_layer_008": 0.020996, "value_mse_loss_layer_009": 0.027588, "value_mse_loss_layer_010": 0.023315, "value_mse_loss_layer_011": 0.024414, "value_mse_loss_layer_012": 0.025024, "value_mse_loss_layer_013": 0.026367, "value_mse_loss_layer_014": 0.02832, "value_mse_loss_layer_015": 0.029541, "value_mse_loss_layer_016": 0.025879, "value_mse_loss_layer_017": 0.028931, "value_mse_loss_layer_018": 0.030518, "value_mse_loss_layer_019": 0.03418, "value_mse_loss_layer_020": 0.033691, "value_mse_loss_layer_021": 0.040771, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.042725, "value_mse_loss_layer_024": 0.05127, "value_mse_loss_layer_025": 0.064941, "value_mse_loss_layer_026": 0.049316, "value_mse_loss_layer_027": 0.064941, "value_mse_loss_layer_028": 0.066406, "value_mse_loss_layer_029": 0.104492, "value_mse_loss_layer_030": 0.092285, "value_mse_loss_layer_031": 0.110352, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 0.0001, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000156, "vq_loss_layer_007": 0.000236, "vq_loss_layer_008": 0.000273, "vq_loss_layer_009": 0.000376, "vq_loss_layer_010": 0.00033, "vq_loss_layer_011": 0.000345, "vq_loss_layer_012": 0.000538, "vq_loss_layer_013": 0.000422, "vq_loss_layer_014": 0.000595, "vq_loss_layer_015": 0.000549, "vq_loss_layer_016": 0.000561, "vq_loss_layer_017": 0.000429, "vq_loss_layer_018": 0.000349, "vq_loss_layer_019": 0.000286, "vq_loss_layer_020": 0.000263, "vq_loss_layer_021": 0.000607, "vq_loss_layer_022": 0.00032, "vq_loss_layer_023": 0.000334, "vq_loss_layer_024": 0.000412, "vq_loss_layer_025": 0.000591, "vq_loss_layer_026": 0.000648, "vq_loss_layer_027": 0.000664, "vq_loss_layer_028": 0.001198, "vq_loss_layer_029": 0.001846, "vq_loss_layer_030": 0.002563, "vq_loss_layer_031": 0.007568 }, { "ce_loss": 2.328702, "epoch": 0.00294, "grad_norm": 0.003949254751205444, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.057938, "kv_vq_loss": 0.000706, "learning_rate": 0.0008670868326030392, "loss": 0.05867, "step": 2940, "value_mse_loss_layer_000": 0.000824, "value_mse_loss_layer_001": 0.002335, "value_mse_loss_layer_002": 0.009155, "value_mse_loss_layer_003": 0.015381, "value_mse_loss_layer_004": 0.013367, "value_mse_loss_layer_005": 0.013794, "value_mse_loss_layer_006": 0.016846, "value_mse_loss_layer_007": 0.017578, "value_mse_loss_layer_008": 0.021729, "value_mse_loss_layer_009": 0.026855, "value_mse_loss_layer_010": 0.023926, "value_mse_loss_layer_011": 0.024902, "value_mse_loss_layer_012": 0.025513, "value_mse_loss_layer_013": 0.0271, "value_mse_loss_layer_014": 0.028809, "value_mse_loss_layer_015": 0.031006, "value_mse_loss_layer_016": 0.026855, "value_mse_loss_layer_017": 0.030396, "value_mse_loss_layer_018": 0.029053, "value_mse_loss_layer_019": 0.032227, "value_mse_loss_layer_020": 0.037598, "value_mse_loss_layer_021": 0.041504, "value_mse_loss_layer_022": 0.040283, "value_mse_loss_layer_023": 0.043945, "value_mse_loss_layer_024": 0.049561, "value_mse_loss_layer_025": 0.0625, "value_mse_loss_layer_026": 0.051025, "value_mse_loss_layer_027": 0.064453, "value_mse_loss_layer_028": 0.067871, "value_mse_loss_layer_029": 0.097656, "value_mse_loss_layer_030": 0.084961, "value_mse_loss_layer_031": 0.106445, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 0.000118, "vq_loss_layer_006": 0.0002, "vq_loss_layer_007": 0.000246, "vq_loss_layer_008": 0.000294, "vq_loss_layer_009": 0.00029, "vq_loss_layer_010": 0.000277, "vq_loss_layer_011": 0.000315, "vq_loss_layer_012": 0.000519, "vq_loss_layer_013": 0.000404, "vq_loss_layer_014": 0.000511, "vq_loss_layer_015": 0.000526, "vq_loss_layer_016": 0.000519, "vq_loss_layer_017": 0.00042, "vq_loss_layer_018": 0.000277, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000282, "vq_loss_layer_021": 0.0005, "vq_loss_layer_022": 0.000362, "vq_loss_layer_023": 0.000347, "vq_loss_layer_024": 0.000311, "vq_loss_layer_025": 0.000423, "vq_loss_layer_026": 0.00061, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000957, "vq_loss_layer_029": 0.001297, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.006073 }, { "ce_loss": 2.250596, "epoch": 0.00295, "grad_norm": 0.004890506621450186, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.057886, "kv_vq_loss": 0.00071, "learning_rate": 0.0008674555039945407, "loss": 0.05864, "step": 2950, "value_mse_loss_layer_000": 0.000813, "value_mse_loss_layer_001": 0.002319, "value_mse_loss_layer_002": 0.009094, "value_mse_loss_layer_003": 0.014648, "value_mse_loss_layer_004": 0.013306, "value_mse_loss_layer_005": 0.013306, "value_mse_loss_layer_006": 0.016113, "value_mse_loss_layer_007": 0.017456, "value_mse_loss_layer_008": 0.02124, "value_mse_loss_layer_009": 0.027222, "value_mse_loss_layer_010": 0.02356, "value_mse_loss_layer_011": 0.02417, "value_mse_loss_layer_012": 0.026245, "value_mse_loss_layer_013": 0.026245, "value_mse_loss_layer_014": 0.027588, "value_mse_loss_layer_015": 0.030762, "value_mse_loss_layer_016": 0.027222, "value_mse_loss_layer_017": 0.030273, "value_mse_loss_layer_018": 0.028198, "value_mse_loss_layer_019": 0.05127, "value_mse_loss_layer_020": 0.034668, "value_mse_loss_layer_021": 0.042969, "value_mse_loss_layer_022": 0.04126, "value_mse_loss_layer_023": 0.058594, "value_mse_loss_layer_024": 0.049072, "value_mse_loss_layer_025": 0.061768, "value_mse_loss_layer_026": 0.05249, "value_mse_loss_layer_027": 0.069824, "value_mse_loss_layer_028": 0.068359, "value_mse_loss_layer_029": 0.101562, "value_mse_loss_layer_030": 0.089844, "value_mse_loss_layer_031": 0.105957, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000163, "vq_loss_layer_007": 0.00024, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.000343, "vq_loss_layer_010": 0.000286, "vq_loss_layer_011": 0.000303, "vq_loss_layer_012": 0.000561, "vq_loss_layer_013": 0.000444, "vq_loss_layer_014": 0.000488, "vq_loss_layer_015": 0.000546, "vq_loss_layer_016": 0.000526, "vq_loss_layer_017": 0.000467, "vq_loss_layer_018": 0.000282, "vq_loss_layer_019": 0.000334, "vq_loss_layer_020": 0.00025, "vq_loss_layer_021": 0.000526, "vq_loss_layer_022": 0.000328, "vq_loss_layer_023": 0.00061, "vq_loss_layer_024": 0.000349, "vq_loss_layer_025": 0.000364, "vq_loss_layer_026": 0.000618, "vq_loss_layer_027": 0.000706, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001709, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.006592 }, { "ce_loss": 2.283599, "epoch": 0.00296, "grad_norm": 0.005810841452330351, "key_mse_loss_layer_000": 0.003693, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.060791, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.094238, "key_mse_loss_layer_009": 0.099609, "key_mse_loss_layer_010": 0.112305, "key_mse_loss_layer_011": 0.10791, "key_mse_loss_layer_012": 0.081055, "key_mse_loss_layer_013": 0.140625, "key_mse_loss_layer_014": 0.136719, "key_mse_loss_layer_015": 0.124023, "key_mse_loss_layer_016": 0.121094, "key_mse_loss_layer_017": 0.116211, "key_mse_loss_layer_018": 0.126953, "key_mse_loss_layer_019": 0.099121, "key_mse_loss_layer_020": 0.11377, "key_mse_loss_layer_021": 0.107422, "key_mse_loss_layer_022": 0.112793, "key_mse_loss_layer_023": 0.110352, "key_mse_loss_layer_024": 0.088867, "key_mse_loss_layer_025": 0.081543, "key_mse_loss_layer_026": 0.101074, "key_mse_loss_layer_027": 0.09668, "key_mse_loss_layer_028": 0.103516, "key_mse_loss_layer_029": 0.090332, "key_mse_loss_layer_030": 0.105957, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.057867, "kv_vq_loss": 0.000701, "learning_rate": 0.0008678229277647345, "loss": 0.058609, "step": 2960, "value_mse_loss_layer_000": 0.000832, "value_mse_loss_layer_001": 0.00238, "value_mse_loss_layer_002": 0.009521, "value_mse_loss_layer_003": 0.015442, "value_mse_loss_layer_004": 0.014893, "value_mse_loss_layer_005": 0.014038, "value_mse_loss_layer_006": 0.016113, "value_mse_loss_layer_007": 0.018066, "value_mse_loss_layer_008": 0.020386, "value_mse_loss_layer_009": 0.026001, "value_mse_loss_layer_010": 0.022705, "value_mse_loss_layer_011": 0.024658, "value_mse_loss_layer_012": 0.025146, "value_mse_loss_layer_013": 0.027222, "value_mse_loss_layer_014": 0.031982, "value_mse_loss_layer_015": 0.0271, "value_mse_loss_layer_016": 0.024048, "value_mse_loss_layer_017": 0.027466, "value_mse_loss_layer_018": 0.031738, "value_mse_loss_layer_019": 0.034668, "value_mse_loss_layer_020": 0.032959, "value_mse_loss_layer_021": 0.039307, "value_mse_loss_layer_022": 0.037109, "value_mse_loss_layer_023": 0.04248, "value_mse_loss_layer_024": 0.045654, "value_mse_loss_layer_025": 0.0625, "value_mse_loss_layer_026": 0.04541, "value_mse_loss_layer_027": 0.062012, "value_mse_loss_layer_028": 0.065918, "value_mse_loss_layer_029": 0.089355, "value_mse_loss_layer_030": 0.097656, "value_mse_loss_layer_031": 0.112305, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 2.7e-05, "vq_loss_layer_002": 2.6e-05, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 0.000122, "vq_loss_layer_005": 0.000138, "vq_loss_layer_006": 0.00019, "vq_loss_layer_007": 0.000261, "vq_loss_layer_008": 0.000319, "vq_loss_layer_009": 0.000355, "vq_loss_layer_010": 0.000349, "vq_loss_layer_011": 0.000393, "vq_loss_layer_012": 0.000572, "vq_loss_layer_013": 0.000492, "vq_loss_layer_014": 0.000725, "vq_loss_layer_015": 0.000511, "vq_loss_layer_016": 0.000553, "vq_loss_layer_017": 0.000492, "vq_loss_layer_018": 0.000471, "vq_loss_layer_019": 0.00029, "vq_loss_layer_020": 0.000288, "vq_loss_layer_021": 0.000645, "vq_loss_layer_022": 0.000423, "vq_loss_layer_023": 0.000549, "vq_loss_layer_024": 0.000507, "vq_loss_layer_025": 0.000828, "vq_loss_layer_026": 0.000847, "vq_loss_layer_027": 0.000999, "vq_loss_layer_028": 0.001556, "vq_loss_layer_029": 0.002045, "vq_loss_layer_030": 0.004028, "vq_loss_layer_031": 0.009033 }, { "ce_loss": 2.253619, "epoch": 0.00297, "grad_norm": 0.006182776764035225, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.058344, "kv_vq_loss": 0.000735, "learning_rate": 0.000868189112329303, "loss": 0.059113, "step": 2970, "value_mse_loss_layer_000": 0.000824, "value_mse_loss_layer_001": 0.002319, "value_mse_loss_layer_002": 0.009399, "value_mse_loss_layer_003": 0.014221, "value_mse_loss_layer_004": 0.013672, "value_mse_loss_layer_005": 0.013367, "value_mse_loss_layer_006": 0.01532, "value_mse_loss_layer_007": 0.017456, "value_mse_loss_layer_008": 0.020996, "value_mse_loss_layer_009": 0.025879, "value_mse_loss_layer_010": 0.026978, "value_mse_loss_layer_011": 0.024048, "value_mse_loss_layer_012": 0.026855, "value_mse_loss_layer_013": 0.025879, "value_mse_loss_layer_014": 0.027466, "value_mse_loss_layer_015": 0.030396, "value_mse_loss_layer_016": 0.026123, "value_mse_loss_layer_017": 0.028931, "value_mse_loss_layer_018": 0.028198, "value_mse_loss_layer_019": 0.032715, "value_mse_loss_layer_020": 0.033936, "value_mse_loss_layer_021": 0.040039, "value_mse_loss_layer_022": 0.041504, "value_mse_loss_layer_023": 0.043213, "value_mse_loss_layer_024": 0.057861, "value_mse_loss_layer_025": 0.063477, "value_mse_loss_layer_026": 0.051025, "value_mse_loss_layer_027": 0.103027, "value_mse_loss_layer_028": 0.071289, "value_mse_loss_layer_029": 0.106445, "value_mse_loss_layer_030": 0.094238, "value_mse_loss_layer_031": 0.109863, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 8.4e-05, "vq_loss_layer_005": 0.0001, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000257, "vq_loss_layer_008": 0.000282, "vq_loss_layer_009": 0.000286, "vq_loss_layer_010": 0.000389, "vq_loss_layer_011": 0.00032, "vq_loss_layer_012": 0.000679, "vq_loss_layer_013": 0.000393, "vq_loss_layer_014": 0.000507, "vq_loss_layer_015": 0.000694, "vq_loss_layer_016": 0.000565, "vq_loss_layer_017": 0.000435, "vq_loss_layer_018": 0.000305, "vq_loss_layer_019": 0.000242, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000486, "vq_loss_layer_022": 0.000355, "vq_loss_layer_023": 0.000319, "vq_loss_layer_024": 0.000439, "vq_loss_layer_025": 0.000496, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.001587, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.001808, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.006378 }, { "ce_loss": 2.300887, "epoch": 0.00298, "grad_norm": 0.00645854277536273, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.058099, "kv_vq_loss": 0.000702, "learning_rate": 0.0008685540660190637, "loss": 0.058841, "step": 2980, "value_mse_loss_layer_000": 0.00082, "value_mse_loss_layer_001": 0.002304, "value_mse_loss_layer_002": 0.009033, "value_mse_loss_layer_003": 0.014343, "value_mse_loss_layer_004": 0.012878, "value_mse_loss_layer_005": 0.013245, "value_mse_loss_layer_006": 0.015991, "value_mse_loss_layer_007": 0.017578, "value_mse_loss_layer_008": 0.020752, "value_mse_loss_layer_009": 0.027466, "value_mse_loss_layer_010": 0.023804, "value_mse_loss_layer_011": 0.024536, "value_mse_loss_layer_012": 0.025024, "value_mse_loss_layer_013": 0.027344, "value_mse_loss_layer_014": 0.027954, "value_mse_loss_layer_015": 0.030273, "value_mse_loss_layer_016": 0.025391, "value_mse_loss_layer_017": 0.030396, "value_mse_loss_layer_018": 0.027344, "value_mse_loss_layer_019": 0.032715, "value_mse_loss_layer_020": 0.040039, "value_mse_loss_layer_021": 0.04126, "value_mse_loss_layer_022": 0.040283, "value_mse_loss_layer_023": 0.04248, "value_mse_loss_layer_024": 0.047607, "value_mse_loss_layer_025": 0.063477, "value_mse_loss_layer_026": 0.046143, "value_mse_loss_layer_027": 0.069824, "value_mse_loss_layer_028": 0.068359, "value_mse_loss_layer_029": 0.124023, "value_mse_loss_layer_030": 0.084961, "value_mse_loss_layer_031": 0.100586, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 0.000104, "vq_loss_layer_006": 0.000184, "vq_loss_layer_007": 0.000263, "vq_loss_layer_008": 0.000246, "vq_loss_layer_009": 0.00034, "vq_loss_layer_010": 0.000284, "vq_loss_layer_011": 0.000305, "vq_loss_layer_012": 0.000549, "vq_loss_layer_013": 0.000469, "vq_loss_layer_014": 0.000496, "vq_loss_layer_015": 0.000507, "vq_loss_layer_016": 0.000496, "vq_loss_layer_017": 0.000463, "vq_loss_layer_018": 0.000265, "vq_loss_layer_019": 0.000248, "vq_loss_layer_020": 0.000313, "vq_loss_layer_021": 0.000553, "vq_loss_layer_022": 0.00038, "vq_loss_layer_023": 0.000345, "vq_loss_layer_024": 0.000328, "vq_loss_layer_025": 0.000458, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000862, "vq_loss_layer_028": 0.001068, "vq_loss_layer_029": 0.002045, "vq_loss_layer_030": 0.00293, "vq_loss_layer_031": 0.006226 }, { "ce_loss": 2.301697, "epoch": 0.00299, "grad_norm": 0.004644854459911585, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.094727, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.083496, "key_mse_loss_layer_024": 0.065918, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.07373, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.057458, "kv_vq_loss": 0.000685, "learning_rate": 0.0008689177970811074, "loss": 0.058179, "step": 2990, "value_mse_loss_layer_000": 0.00082, "value_mse_loss_layer_001": 0.002335, "value_mse_loss_layer_002": 0.009399, "value_mse_loss_layer_003": 0.014526, "value_mse_loss_layer_004": 0.013245, "value_mse_loss_layer_005": 0.013123, "value_mse_loss_layer_006": 0.015625, "value_mse_loss_layer_007": 0.018555, "value_mse_loss_layer_008": 0.02063, "value_mse_loss_layer_009": 0.027588, "value_mse_loss_layer_010": 0.02356, "value_mse_loss_layer_011": 0.024658, "value_mse_loss_layer_012": 0.025024, "value_mse_loss_layer_013": 0.026733, "value_mse_loss_layer_014": 0.027588, "value_mse_loss_layer_015": 0.033447, "value_mse_loss_layer_016": 0.026123, "value_mse_loss_layer_017": 0.030396, "value_mse_loss_layer_018": 0.026489, "value_mse_loss_layer_019": 0.033691, "value_mse_loss_layer_020": 0.03418, "value_mse_loss_layer_021": 0.049561, "value_mse_loss_layer_022": 0.037598, "value_mse_loss_layer_023": 0.048584, "value_mse_loss_layer_024": 0.044434, "value_mse_loss_layer_025": 0.060791, "value_mse_loss_layer_026": 0.048584, "value_mse_loss_layer_027": 0.063477, "value_mse_loss_layer_028": 0.064941, "value_mse_loss_layer_029": 0.097656, "value_mse_loss_layer_030": 0.081055, "value_mse_loss_layer_031": 0.101562, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 9.9e-05, "vq_loss_layer_006": 0.000161, "vq_loss_layer_007": 0.000288, "vq_loss_layer_008": 0.000228, "vq_loss_layer_009": 0.000336, "vq_loss_layer_010": 0.000273, "vq_loss_layer_011": 0.000309, "vq_loss_layer_012": 0.000504, "vq_loss_layer_013": 0.000427, "vq_loss_layer_014": 0.000511, "vq_loss_layer_015": 0.00074, "vq_loss_layer_016": 0.00053, "vq_loss_layer_017": 0.000511, "vq_loss_layer_018": 0.000236, "vq_loss_layer_019": 0.000221, "vq_loss_layer_020": 0.000277, "vq_loss_layer_021": 0.000736, "vq_loss_layer_022": 0.000298, "vq_loss_layer_023": 0.000519, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.000443, "vq_loss_layer_026": 0.000603, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.000946, "vq_loss_layer_029": 0.001419, "vq_loss_layer_030": 0.001831, "vq_loss_layer_031": 0.005951 }, { "ce_loss": 2.308125, "epoch": 0.003, "grad_norm": 0.0052521442994475365, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.057739, "kv_vq_loss": 0.000694, "learning_rate": 0.0008692803136799154, "loss": 0.058472, "step": 3000, "value_mse_loss_layer_000": 0.00082, "value_mse_loss_layer_001": 0.002319, "value_mse_loss_layer_002": 0.009583, "value_mse_loss_layer_003": 0.014832, "value_mse_loss_layer_004": 0.013672, "value_mse_loss_layer_005": 0.013611, "value_mse_loss_layer_006": 0.016113, "value_mse_loss_layer_007": 0.017456, "value_mse_loss_layer_008": 0.02124, "value_mse_loss_layer_009": 0.027222, "value_mse_loss_layer_010": 0.023071, "value_mse_loss_layer_011": 0.025146, "value_mse_loss_layer_012": 0.027588, "value_mse_loss_layer_013": 0.0271, "value_mse_loss_layer_014": 0.028931, "value_mse_loss_layer_015": 0.032227, "value_mse_loss_layer_016": 0.026855, "value_mse_loss_layer_017": 0.029907, "value_mse_loss_layer_018": 0.030151, "value_mse_loss_layer_019": 0.034424, "value_mse_loss_layer_020": 0.033691, "value_mse_loss_layer_021": 0.042969, "value_mse_loss_layer_022": 0.041016, "value_mse_loss_layer_023": 0.043945, "value_mse_loss_layer_024": 0.049805, "value_mse_loss_layer_025": 0.061768, "value_mse_loss_layer_026": 0.057617, "value_mse_loss_layer_027": 0.063965, "value_mse_loss_layer_028": 0.070801, "value_mse_loss_layer_029": 0.095703, "value_mse_loss_layer_030": 0.094238, "value_mse_loss_layer_031": 0.108398, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 9e-05, "vq_loss_layer_005": 0.000111, "vq_loss_layer_006": 0.000182, "vq_loss_layer_007": 0.00025, "vq_loss_layer_008": 0.000261, "vq_loss_layer_009": 0.000334, "vq_loss_layer_010": 0.000282, "vq_loss_layer_011": 0.000338, "vq_loss_layer_012": 0.000641, "vq_loss_layer_013": 0.000448, "vq_loss_layer_014": 0.000546, "vq_loss_layer_015": 0.00061, "vq_loss_layer_016": 0.000534, "vq_loss_layer_017": 0.000443, "vq_loss_layer_018": 0.000368, "vq_loss_layer_019": 0.000236, "vq_loss_layer_020": 0.00029, "vq_loss_layer_021": 0.000626, "vq_loss_layer_022": 0.000378, "vq_loss_layer_023": 0.000372, "vq_loss_layer_024": 0.000353, "vq_loss_layer_025": 0.000408, "vq_loss_layer_026": 0.000881, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.001221, "vq_loss_layer_029": 0.00145, "vq_loss_layer_030": 0.002625, "vq_loss_layer_031": 0.006653 }, { "ce_loss": 2.280771, "epoch": 0.00301, "grad_norm": 0.004957872908562422, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.011841, "key_mse_loss_layer_002": 0.063477, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.095215, "key_mse_loss_layer_009": 0.099609, "key_mse_loss_layer_010": 0.11377, "key_mse_loss_layer_011": 0.106445, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.139648, "key_mse_loss_layer_014": 0.138672, "key_mse_loss_layer_015": 0.121582, "key_mse_loss_layer_016": 0.12207, "key_mse_loss_layer_017": 0.115234, "key_mse_loss_layer_018": 0.125977, "key_mse_loss_layer_019": 0.098145, "key_mse_loss_layer_020": 0.116211, "key_mse_loss_layer_021": 0.107422, "key_mse_loss_layer_022": 0.117188, "key_mse_loss_layer_023": 0.118164, "key_mse_loss_layer_024": 0.098633, "key_mse_loss_layer_025": 0.092285, "key_mse_loss_layer_026": 0.10791, "key_mse_loss_layer_027": 0.111328, "key_mse_loss_layer_028": 0.114258, "key_mse_loss_layer_029": 0.104492, "key_mse_loss_layer_030": 0.119141, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.057965, "kv_vq_loss": 0.000706, "learning_rate": 0.0008696416238984608, "loss": 0.058719, "step": 3010, "value_mse_loss_layer_000": 0.000835, "value_mse_loss_layer_001": 0.00235, "value_mse_loss_layer_002": 0.008972, "value_mse_loss_layer_003": 0.01416, "value_mse_loss_layer_004": 0.013672, "value_mse_loss_layer_005": 0.012756, "value_mse_loss_layer_006": 0.015259, "value_mse_loss_layer_007": 0.017456, "value_mse_loss_layer_008": 0.020142, "value_mse_loss_layer_009": 0.025024, "value_mse_loss_layer_010": 0.024292, "value_mse_loss_layer_011": 0.023193, "value_mse_loss_layer_012": 0.02417, "value_mse_loss_layer_013": 0.025391, "value_mse_loss_layer_014": 0.025879, "value_mse_loss_layer_015": 0.025269, "value_mse_loss_layer_016": 0.022949, "value_mse_loss_layer_017": 0.026489, "value_mse_loss_layer_018": 0.026489, "value_mse_loss_layer_019": 0.033691, "value_mse_loss_layer_020": 0.033203, "value_mse_loss_layer_021": 0.038574, "value_mse_loss_layer_022": 0.036865, "value_mse_loss_layer_023": 0.042236, "value_mse_loss_layer_024": 0.04834, "value_mse_loss_layer_025": 0.059814, "value_mse_loss_layer_026": 0.050781, "value_mse_loss_layer_027": 0.068848, "value_mse_loss_layer_028": 0.067871, "value_mse_loss_layer_029": 0.120605, "value_mse_loss_layer_030": 0.102051, "value_mse_loss_layer_031": 0.111816, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 2.5e-05, "vq_loss_layer_002": 3.4e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 0.000106, "vq_loss_layer_005": 0.000108, "vq_loss_layer_006": 0.000185, "vq_loss_layer_007": 0.000257, "vq_loss_layer_008": 0.000334, "vq_loss_layer_009": 0.000349, "vq_loss_layer_010": 0.000458, "vq_loss_layer_011": 0.00033, "vq_loss_layer_012": 0.000542, "vq_loss_layer_013": 0.000437, "vq_loss_layer_014": 0.000584, "vq_loss_layer_015": 0.000484, "vq_loss_layer_016": 0.000557, "vq_loss_layer_017": 0.000414, "vq_loss_layer_018": 0.000349, "vq_loss_layer_019": 0.000296, "vq_loss_layer_020": 0.000326, "vq_loss_layer_021": 0.00061, "vq_loss_layer_022": 0.000336, "vq_loss_layer_023": 0.000393, "vq_loss_layer_024": 0.000467, "vq_loss_layer_025": 0.000687, "vq_loss_layer_026": 0.000744, "vq_loss_layer_027": 0.000847, "vq_loss_layer_028": 0.001457, "vq_loss_layer_029": 0.003082, "vq_loss_layer_030": 0.004211, "vq_loss_layer_031": 0.009644 }, { "ce_loss": 2.27586, "epoch": 0.00302, "grad_norm": 0.004810061771422625, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.0577, "kv_vq_loss": 0.00072, "learning_rate": 0.0008700017357392875, "loss": 0.058453, "step": 3020, "value_mse_loss_layer_000": 0.00082, "value_mse_loss_layer_001": 0.002319, "value_mse_loss_layer_002": 0.009094, "value_mse_loss_layer_003": 0.014893, "value_mse_loss_layer_004": 0.013611, "value_mse_loss_layer_005": 0.015747, "value_mse_loss_layer_006": 0.015747, "value_mse_loss_layer_007": 0.017456, "value_mse_loss_layer_008": 0.020874, "value_mse_loss_layer_009": 0.027588, "value_mse_loss_layer_010": 0.023193, "value_mse_loss_layer_011": 0.025146, "value_mse_loss_layer_012": 0.029297, "value_mse_loss_layer_013": 0.026855, "value_mse_loss_layer_014": 0.029663, "value_mse_loss_layer_015": 0.03125, "value_mse_loss_layer_016": 0.026001, "value_mse_loss_layer_017": 0.029297, "value_mse_loss_layer_018": 0.028931, "value_mse_loss_layer_019": 0.032227, "value_mse_loss_layer_020": 0.033447, "value_mse_loss_layer_021": 0.039062, "value_mse_loss_layer_022": 0.038574, "value_mse_loss_layer_023": 0.052734, "value_mse_loss_layer_024": 0.04834, "value_mse_loss_layer_025": 0.063965, "value_mse_loss_layer_026": 0.050293, "value_mse_loss_layer_027": 0.063477, "value_mse_loss_layer_028": 0.071289, "value_mse_loss_layer_029": 0.099609, "value_mse_loss_layer_030": 0.090332, "value_mse_loss_layer_031": 0.107422, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 9e-05, "vq_loss_layer_005": 0.00017, "vq_loss_layer_006": 0.000169, "vq_loss_layer_007": 0.000244, "vq_loss_layer_008": 0.000273, "vq_loss_layer_009": 0.000368, "vq_loss_layer_010": 0.000299, "vq_loss_layer_011": 0.000359, "vq_loss_layer_012": 0.000824, "vq_loss_layer_013": 0.000448, "vq_loss_layer_014": 0.000553, "vq_loss_layer_015": 0.000572, "vq_loss_layer_016": 0.000553, "vq_loss_layer_017": 0.000452, "vq_loss_layer_018": 0.000309, "vq_loss_layer_019": 0.000237, "vq_loss_layer_020": 0.000252, "vq_loss_layer_021": 0.000496, "vq_loss_layer_022": 0.000309, "vq_loss_layer_023": 0.000599, "vq_loss_layer_024": 0.000443, "vq_loss_layer_025": 0.00053, "vq_loss_layer_026": 0.000748, "vq_loss_layer_027": 0.000816, "vq_loss_layer_028": 0.001213, "vq_loss_layer_029": 0.001663, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.006531 }, { "ce_loss": 2.294292, "epoch": 0.00303, "grad_norm": 0.004008223302662373, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.058002, "kv_vq_loss": 0.000701, "learning_rate": 0.0008703606571255761, "loss": 0.058734, "step": 3030, "value_mse_loss_layer_000": 0.000809, "value_mse_loss_layer_001": 0.002335, "value_mse_loss_layer_002": 0.01001, "value_mse_loss_layer_003": 0.0177, "value_mse_loss_layer_004": 0.012512, "value_mse_loss_layer_005": 0.013062, "value_mse_loss_layer_006": 0.015564, "value_mse_loss_layer_007": 0.016724, "value_mse_loss_layer_008": 0.02063, "value_mse_loss_layer_009": 0.0271, "value_mse_loss_layer_010": 0.022827, "value_mse_loss_layer_011": 0.024292, "value_mse_loss_layer_012": 0.026123, "value_mse_loss_layer_013": 0.026001, "value_mse_loss_layer_014": 0.027344, "value_mse_loss_layer_015": 0.031006, "value_mse_loss_layer_016": 0.026489, "value_mse_loss_layer_017": 0.030396, "value_mse_loss_layer_018": 0.027954, "value_mse_loss_layer_019": 0.03125, "value_mse_loss_layer_020": 0.033691, "value_mse_loss_layer_021": 0.039307, "value_mse_loss_layer_022": 0.037598, "value_mse_loss_layer_023": 0.046387, "value_mse_loss_layer_024": 0.046387, "value_mse_loss_layer_025": 0.062256, "value_mse_loss_layer_026": 0.048828, "value_mse_loss_layer_027": 0.065918, "value_mse_loss_layer_028": 0.065918, "value_mse_loss_layer_029": 0.096191, "value_mse_loss_layer_030": 0.086914, "value_mse_loss_layer_031": 0.103027, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 7.7e-05, "vq_loss_layer_005": 9.9e-05, "vq_loss_layer_006": 0.000181, "vq_loss_layer_007": 0.000238, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000368, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.000313, "vq_loss_layer_012": 0.000652, "vq_loss_layer_013": 0.000414, "vq_loss_layer_014": 0.000492, "vq_loss_layer_015": 0.000557, "vq_loss_layer_016": 0.000546, "vq_loss_layer_017": 0.000507, "vq_loss_layer_018": 0.000265, "vq_loss_layer_019": 0.000198, "vq_loss_layer_020": 0.000257, "vq_loss_layer_021": 0.000465, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000385, "vq_loss_layer_024": 0.000301, "vq_loss_layer_025": 0.000393, "vq_loss_layer_026": 0.000607, "vq_loss_layer_027": 0.00069, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.001366, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.005829 }, { "ce_loss": 2.314566, "epoch": 0.00304, "grad_norm": 0.00427263043820858, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.057455, "kv_vq_loss": 0.000679, "learning_rate": 0.0008707183959021882, "loss": 0.05816, "step": 3040, "value_mse_loss_layer_000": 0.000809, "value_mse_loss_layer_001": 0.002304, "value_mse_loss_layer_002": 0.008972, "value_mse_loss_layer_003": 0.015076, "value_mse_loss_layer_004": 0.013916, "value_mse_loss_layer_005": 0.012695, "value_mse_loss_layer_006": 0.015442, "value_mse_loss_layer_007": 0.017578, "value_mse_loss_layer_008": 0.020386, "value_mse_loss_layer_009": 0.026001, "value_mse_loss_layer_010": 0.022339, "value_mse_loss_layer_011": 0.023926, "value_mse_loss_layer_012": 0.024048, "value_mse_loss_layer_013": 0.026855, "value_mse_loss_layer_014": 0.027954, "value_mse_loss_layer_015": 0.030151, "value_mse_loss_layer_016": 0.026001, "value_mse_loss_layer_017": 0.029053, "value_mse_loss_layer_018": 0.027832, "value_mse_loss_layer_019": 0.0354, "value_mse_loss_layer_020": 0.034912, "value_mse_loss_layer_021": 0.040039, "value_mse_loss_layer_022": 0.043945, "value_mse_loss_layer_023": 0.045898, "value_mse_loss_layer_024": 0.050049, "value_mse_loss_layer_025": 0.065918, "value_mse_loss_layer_026": 0.050049, "value_mse_loss_layer_027": 0.064941, "value_mse_loss_layer_028": 0.072266, "value_mse_loss_layer_029": 0.100098, "value_mse_loss_layer_030": 0.092773, "value_mse_loss_layer_031": 0.101074, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 0.000126, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000165, "vq_loss_layer_007": 0.000284, "vq_loss_layer_008": 0.000227, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.000237, "vq_loss_layer_011": 0.000303, "vq_loss_layer_012": 0.000481, "vq_loss_layer_013": 0.000414, "vq_loss_layer_014": 0.000488, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.000515, "vq_loss_layer_017": 0.000395, "vq_loss_layer_018": 0.000269, "vq_loss_layer_019": 0.000256, "vq_loss_layer_020": 0.00025, "vq_loss_layer_021": 0.000423, "vq_loss_layer_022": 0.000374, "vq_loss_layer_023": 0.000334, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.000362, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000954, "vq_loss_layer_029": 0.001427, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.005615 }, { "ce_loss": 2.263637, "epoch": 0.00305, "grad_norm": 0.00478446576744318, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.135742, "key_mse_loss_layer_014": 0.129883, "key_mse_loss_layer_015": 0.119141, "key_mse_loss_layer_016": 0.11377, "key_mse_loss_layer_017": 0.115234, "key_mse_loss_layer_018": 0.121582, "key_mse_loss_layer_019": 0.098145, "key_mse_loss_layer_020": 0.112793, "key_mse_loss_layer_021": 0.105469, "key_mse_loss_layer_022": 0.108887, "key_mse_loss_layer_023": 0.10791, "key_mse_loss_layer_024": 0.084473, "key_mse_loss_layer_025": 0.080566, "key_mse_loss_layer_026": 0.095215, "key_mse_loss_layer_027": 0.09082, "key_mse_loss_layer_028": 0.098633, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.058151, "kv_vq_loss": 0.000713, "learning_rate": 0.0008710749598366962, "loss": 0.058893, "step": 3050, "value_mse_loss_layer_000": 0.000797, "value_mse_loss_layer_001": 0.002365, "value_mse_loss_layer_002": 0.009521, "value_mse_loss_layer_003": 0.014648, "value_mse_loss_layer_004": 0.014709, "value_mse_loss_layer_005": 0.014038, "value_mse_loss_layer_006": 0.016724, "value_mse_loss_layer_007": 0.018311, "value_mse_loss_layer_008": 0.020874, "value_mse_loss_layer_009": 0.026978, "value_mse_loss_layer_010": 0.02356, "value_mse_loss_layer_011": 0.02356, "value_mse_loss_layer_012": 0.024048, "value_mse_loss_layer_013": 0.026123, "value_mse_loss_layer_014": 0.027832, "value_mse_loss_layer_015": 0.026733, "value_mse_loss_layer_016": 0.023804, "value_mse_loss_layer_017": 0.028564, "value_mse_loss_layer_018": 0.026978, "value_mse_loss_layer_019": 0.031982, "value_mse_loss_layer_020": 0.032959, "value_mse_loss_layer_021": 0.04834, "value_mse_loss_layer_022": 0.034424, "value_mse_loss_layer_023": 0.040527, "value_mse_loss_layer_024": 0.044189, "value_mse_loss_layer_025": 0.055664, "value_mse_loss_layer_026": 0.049805, "value_mse_loss_layer_027": 0.069824, "value_mse_loss_layer_028": 0.060791, "value_mse_loss_layer_029": 0.09375, "value_mse_loss_layer_030": 0.087402, "value_mse_loss_layer_031": 0.108398, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 2.8e-05, "vq_loss_layer_002": 2.8e-05, "vq_loss_layer_003": 5.3e-05, "vq_loss_layer_004": 0.000133, "vq_loss_layer_005": 0.000119, "vq_loss_layer_006": 0.000238, "vq_loss_layer_007": 0.000275, "vq_loss_layer_008": 0.000351, "vq_loss_layer_009": 0.000368, "vq_loss_layer_010": 0.00037, "vq_loss_layer_011": 0.000351, "vq_loss_layer_012": 0.000534, "vq_loss_layer_013": 0.000418, "vq_loss_layer_014": 0.000648, "vq_loss_layer_015": 0.000496, "vq_loss_layer_016": 0.000477, "vq_loss_layer_017": 0.000433, "vq_loss_layer_018": 0.000299, "vq_loss_layer_019": 0.000273, "vq_loss_layer_020": 0.000275, "vq_loss_layer_021": 0.000919, "vq_loss_layer_022": 0.000353, "vq_loss_layer_023": 0.00045, "vq_loss_layer_024": 0.000372, "vq_loss_layer_025": 0.000675, "vq_loss_layer_026": 0.000938, "vq_loss_layer_027": 0.001083, "vq_loss_layer_028": 0.001274, "vq_loss_layer_029": 0.001839, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.00885 }, { "ce_loss": 2.279418, "epoch": 0.00306, "grad_norm": 0.003260956611484289, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.057806, "kv_vq_loss": 0.000705, "learning_rate": 0.0008714303566203949, "loss": 0.058524, "step": 3060, "value_mse_loss_layer_000": 0.000816, "value_mse_loss_layer_001": 0.002289, "value_mse_loss_layer_002": 0.009521, "value_mse_loss_layer_003": 0.014526, "value_mse_loss_layer_004": 0.012817, "value_mse_loss_layer_005": 0.012878, "value_mse_loss_layer_006": 0.01532, "value_mse_loss_layer_007": 0.0177, "value_mse_loss_layer_008": 0.020874, "value_mse_loss_layer_009": 0.026733, "value_mse_loss_layer_010": 0.022583, "value_mse_loss_layer_011": 0.025146, "value_mse_loss_layer_012": 0.025635, "value_mse_loss_layer_013": 0.026733, "value_mse_loss_layer_014": 0.030273, "value_mse_loss_layer_015": 0.03125, "value_mse_loss_layer_016": 0.026245, "value_mse_loss_layer_017": 0.03125, "value_mse_loss_layer_018": 0.027344, "value_mse_loss_layer_019": 0.032227, "value_mse_loss_layer_020": 0.033203, "value_mse_loss_layer_021": 0.043701, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.042969, "value_mse_loss_layer_024": 0.05127, "value_mse_loss_layer_025": 0.058105, "value_mse_loss_layer_026": 0.048096, "value_mse_loss_layer_027": 0.062256, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.091797, "value_mse_loss_layer_030": 0.083984, "value_mse_loss_layer_031": 0.099121, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000161, "vq_loss_layer_007": 0.000273, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.000332, "vq_loss_layer_010": 0.000242, "vq_loss_layer_011": 0.000391, "vq_loss_layer_012": 0.00058, "vq_loss_layer_013": 0.000425, "vq_loss_layer_014": 0.000526, "vq_loss_layer_015": 0.000599, "vq_loss_layer_016": 0.000475, "vq_loss_layer_017": 0.000538, "vq_loss_layer_018": 0.000256, "vq_loss_layer_019": 0.000215, "vq_loss_layer_020": 0.000237, "vq_loss_layer_021": 0.000549, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000319, "vq_loss_layer_024": 0.000351, "vq_loss_layer_025": 0.000364, "vq_loss_layer_026": 0.00058, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.001305, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.005554 }, { "ce_loss": 2.301518, "epoch": 0.00307, "grad_norm": 0.004449224565178156, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.057867, "kv_vq_loss": 0.000719, "learning_rate": 0.0008717845938692965, "loss": 0.058618, "step": 3070, "value_mse_loss_layer_000": 0.000801, "value_mse_loss_layer_001": 0.002335, "value_mse_loss_layer_002": 0.009399, "value_mse_loss_layer_003": 0.014771, "value_mse_loss_layer_004": 0.01416, "value_mse_loss_layer_005": 0.013306, "value_mse_loss_layer_006": 0.019043, "value_mse_loss_layer_007": 0.017578, "value_mse_loss_layer_008": 0.021118, "value_mse_loss_layer_009": 0.026733, "value_mse_loss_layer_010": 0.02356, "value_mse_loss_layer_011": 0.024536, "value_mse_loss_layer_012": 0.025635, "value_mse_loss_layer_013": 0.028076, "value_mse_loss_layer_014": 0.027588, "value_mse_loss_layer_015": 0.030273, "value_mse_loss_layer_016": 0.026367, "value_mse_loss_layer_017": 0.030518, "value_mse_loss_layer_018": 0.029785, "value_mse_loss_layer_019": 0.037109, "value_mse_loss_layer_020": 0.040771, "value_mse_loss_layer_021": 0.040283, "value_mse_loss_layer_022": 0.039795, "value_mse_loss_layer_023": 0.043945, "value_mse_loss_layer_024": 0.046875, "value_mse_loss_layer_025": 0.064453, "value_mse_loss_layer_026": 0.048584, "value_mse_loss_layer_027": 0.06543, "value_mse_loss_layer_028": 0.070312, "value_mse_loss_layer_029": 0.09668, "value_mse_loss_layer_030": 0.091309, "value_mse_loss_layer_031": 0.104004, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 0.000127, "vq_loss_layer_005": 0.000105, "vq_loss_layer_006": 0.000324, "vq_loss_layer_007": 0.000259, "vq_loss_layer_008": 0.000259, "vq_loss_layer_009": 0.000309, "vq_loss_layer_010": 0.000284, "vq_loss_layer_011": 0.000315, "vq_loss_layer_012": 0.000561, "vq_loss_layer_013": 0.000519, "vq_loss_layer_014": 0.000496, "vq_loss_layer_015": 0.000549, "vq_loss_layer_016": 0.000515, "vq_loss_layer_017": 0.000483, "vq_loss_layer_018": 0.000313, "vq_loss_layer_019": 0.000246, "vq_loss_layer_020": 0.000362, "vq_loss_layer_021": 0.000475, "vq_loss_layer_022": 0.000313, "vq_loss_layer_023": 0.000362, "vq_loss_layer_024": 0.000338, "vq_loss_layer_025": 0.000454, "vq_loss_layer_026": 0.000584, "vq_loss_layer_027": 0.000717, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001488, "vq_loss_layer_030": 0.002975, "vq_loss_layer_031": 0.005829 }, { "ce_loss": 2.310069, "epoch": 0.00308, "grad_norm": 0.004622325301170349, "key_mse_loss_layer_000": 0.00412, "key_mse_loss_layer_001": 0.011475, "key_mse_loss_layer_002": 0.059082, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.103027, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.094238, "key_mse_loss_layer_020": 0.103516, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.057364, "kv_vq_loss": 0.000689, "learning_rate": 0.0008721376791251109, "loss": 0.05809, "step": 3080, "value_mse_loss_layer_000": 0.000843, "value_mse_loss_layer_001": 0.002411, "value_mse_loss_layer_002": 0.009888, "value_mse_loss_layer_003": 0.015259, "value_mse_loss_layer_004": 0.01416, "value_mse_loss_layer_005": 0.014343, "value_mse_loss_layer_006": 0.016602, "value_mse_loss_layer_007": 0.018066, "value_mse_loss_layer_008": 0.022461, "value_mse_loss_layer_009": 0.027588, "value_mse_loss_layer_010": 0.025635, "value_mse_loss_layer_011": 0.025391, "value_mse_loss_layer_012": 0.025757, "value_mse_loss_layer_013": 0.027466, "value_mse_loss_layer_014": 0.029419, "value_mse_loss_layer_015": 0.031738, "value_mse_loss_layer_016": 0.028809, "value_mse_loss_layer_017": 0.03064, "value_mse_loss_layer_018": 0.02832, "value_mse_loss_layer_019": 0.032471, "value_mse_loss_layer_020": 0.035645, "value_mse_loss_layer_021": 0.043213, "value_mse_loss_layer_022": 0.039307, "value_mse_loss_layer_023": 0.042969, "value_mse_loss_layer_024": 0.049316, "value_mse_loss_layer_025": 0.057129, "value_mse_loss_layer_026": 0.048828, "value_mse_loss_layer_027": 0.079102, "value_mse_loss_layer_028": 0.063965, "value_mse_loss_layer_029": 0.092773, "value_mse_loss_layer_030": 0.089844, "value_mse_loss_layer_031": 0.110352, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 9.1e-05, "vq_loss_layer_005": 0.000121, "vq_loss_layer_006": 0.000187, "vq_loss_layer_007": 0.000263, "vq_loss_layer_008": 0.00034, "vq_loss_layer_009": 0.00036, "vq_loss_layer_010": 0.000341, "vq_loss_layer_011": 0.000357, "vq_loss_layer_012": 0.000534, "vq_loss_layer_013": 0.000454, "vq_loss_layer_014": 0.000629, "vq_loss_layer_015": 0.000679, "vq_loss_layer_016": 0.000683, "vq_loss_layer_017": 0.000576, "vq_loss_layer_018": 0.000353, "vq_loss_layer_019": 0.000286, "vq_loss_layer_020": 0.00036, "vq_loss_layer_021": 0.000671, "vq_loss_layer_022": 0.000393, "vq_loss_layer_023": 0.000406, "vq_loss_layer_024": 0.000492, "vq_loss_layer_025": 0.000546, "vq_loss_layer_026": 0.000782, "vq_loss_layer_027": 0.001312, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.001892, "vq_loss_layer_030": 0.003632, "vq_loss_layer_031": 0.007751 }, { "ce_loss": 2.312417, "epoch": 0.00309, "grad_norm": 0.004294517915695906, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.10791, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.097168, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.057986, "kv_vq_loss": 0.000714, "learning_rate": 0.0008724896198562085, "loss": 0.058725, "step": 3090, "value_mse_loss_layer_000": 0.000771, "value_mse_loss_layer_001": 0.002304, "value_mse_loss_layer_002": 0.009155, "value_mse_loss_layer_003": 0.014587, "value_mse_loss_layer_004": 0.013489, "value_mse_loss_layer_005": 0.013489, "value_mse_loss_layer_006": 0.015869, "value_mse_loss_layer_007": 0.018066, "value_mse_loss_layer_008": 0.02124, "value_mse_loss_layer_009": 0.026489, "value_mse_loss_layer_010": 0.023438, "value_mse_loss_layer_011": 0.025024, "value_mse_loss_layer_012": 0.026001, "value_mse_loss_layer_013": 0.028198, "value_mse_loss_layer_014": 0.027954, "value_mse_loss_layer_015": 0.030151, "value_mse_loss_layer_016": 0.026855, "value_mse_loss_layer_017": 0.029053, "value_mse_loss_layer_018": 0.033691, "value_mse_loss_layer_019": 0.032227, "value_mse_loss_layer_020": 0.035156, "value_mse_loss_layer_021": 0.040771, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.045654, "value_mse_loss_layer_024": 0.048584, "value_mse_loss_layer_025": 0.057861, "value_mse_loss_layer_026": 0.052979, "value_mse_loss_layer_027": 0.069336, "value_mse_loss_layer_028": 0.072754, "value_mse_loss_layer_029": 0.099609, "value_mse_loss_layer_030": 0.091309, "value_mse_loss_layer_031": 0.110352, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 0.000107, "vq_loss_layer_006": 0.000173, "vq_loss_layer_007": 0.000265, "vq_loss_layer_008": 0.000313, "vq_loss_layer_009": 0.000334, "vq_loss_layer_010": 0.00034, "vq_loss_layer_011": 0.00036, "vq_loss_layer_012": 0.000568, "vq_loss_layer_013": 0.000542, "vq_loss_layer_014": 0.000572, "vq_loss_layer_015": 0.000622, "vq_loss_layer_016": 0.000599, "vq_loss_layer_017": 0.000454, "vq_loss_layer_018": 0.000504, "vq_loss_layer_019": 0.000257, "vq_loss_layer_020": 0.000315, "vq_loss_layer_021": 0.000549, "vq_loss_layer_022": 0.000334, "vq_loss_layer_023": 0.000454, "vq_loss_layer_024": 0.000418, "vq_loss_layer_025": 0.000534, "vq_loss_layer_026": 0.000824, "vq_loss_layer_027": 0.000839, "vq_loss_layer_028": 0.001442, "vq_loss_layer_029": 0.00206, "vq_loss_layer_030": 0.003693, "vq_loss_layer_031": 0.008423 }, { "ce_loss": 2.304273, "epoch": 0.0031, "grad_norm": 0.004265791270881891, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.043945, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.090332, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.10791, "key_mse_loss_layer_011": 0.104004, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.131836, "key_mse_loss_layer_014": 0.128906, "key_mse_loss_layer_015": 0.116699, "key_mse_loss_layer_016": 0.110352, "key_mse_loss_layer_017": 0.108887, "key_mse_loss_layer_018": 0.115234, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.103516, "key_mse_loss_layer_023": 0.100586, "key_mse_loss_layer_024": 0.080078, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.088867, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.057401, "kv_vq_loss": 0.000691, "learning_rate": 0.000872840423458568, "loss": 0.058124, "step": 3100, "value_mse_loss_layer_000": 0.000809, "value_mse_loss_layer_001": 0.002289, "value_mse_loss_layer_002": 0.009216, "value_mse_loss_layer_003": 0.014771, "value_mse_loss_layer_004": 0.013733, "value_mse_loss_layer_005": 0.01355, "value_mse_loss_layer_006": 0.015503, "value_mse_loss_layer_007": 0.017212, "value_mse_loss_layer_008": 0.021362, "value_mse_loss_layer_009": 0.026001, "value_mse_loss_layer_010": 0.022461, "value_mse_loss_layer_011": 0.02356, "value_mse_loss_layer_012": 0.025391, "value_mse_loss_layer_013": 0.026001, "value_mse_loss_layer_014": 0.026733, "value_mse_loss_layer_015": 0.027466, "value_mse_loss_layer_016": 0.023438, "value_mse_loss_layer_017": 0.029053, "value_mse_loss_layer_018": 0.0271, "value_mse_loss_layer_019": 0.031982, "value_mse_loss_layer_020": 0.031494, "value_mse_loss_layer_021": 0.038818, "value_mse_loss_layer_022": 0.034424, "value_mse_loss_layer_023": 0.039307, "value_mse_loss_layer_024": 0.043457, "value_mse_loss_layer_025": 0.059814, "value_mse_loss_layer_026": 0.044678, "value_mse_loss_layer_027": 0.059326, "value_mse_loss_layer_028": 0.062988, "value_mse_loss_layer_029": 0.088379, "value_mse_loss_layer_030": 0.082031, "value_mse_loss_layer_031": 0.102539, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 9.2e-05, "vq_loss_layer_005": 0.000119, "vq_loss_layer_006": 0.00017, "vq_loss_layer_007": 0.000238, "vq_loss_layer_008": 0.000374, "vq_loss_layer_009": 0.000359, "vq_loss_layer_010": 0.000349, "vq_loss_layer_011": 0.000326, "vq_loss_layer_012": 0.000626, "vq_loss_layer_013": 0.000439, "vq_loss_layer_014": 0.00061, "vq_loss_layer_015": 0.000484, "vq_loss_layer_016": 0.000515, "vq_loss_layer_017": 0.000511, "vq_loss_layer_018": 0.00028, "vq_loss_layer_019": 0.000257, "vq_loss_layer_020": 0.000269, "vq_loss_layer_021": 0.000637, "vq_loss_layer_022": 0.000328, "vq_loss_layer_023": 0.000437, "vq_loss_layer_024": 0.000423, "vq_loss_layer_025": 0.000778, "vq_loss_layer_026": 0.000801, "vq_loss_layer_027": 0.000908, "vq_loss_layer_028": 0.001656, "vq_loss_layer_029": 0.00177, "vq_loss_layer_030": 0.002823, "vq_loss_layer_031": 0.007874 }, { "ce_loss": 2.271326, "epoch": 0.00311, "grad_norm": 0.004193413536995649, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.057922, "kv_vq_loss": 0.000716, "learning_rate": 0.0008731900972567092, "loss": 0.058655, "step": 3110, "value_mse_loss_layer_000": 0.000828, "value_mse_loss_layer_001": 0.00235, "value_mse_loss_layer_002": 0.009277, "value_mse_loss_layer_003": 0.014404, "value_mse_loss_layer_004": 0.013916, "value_mse_loss_layer_005": 0.013428, "value_mse_loss_layer_006": 0.015991, "value_mse_loss_layer_007": 0.017822, "value_mse_loss_layer_008": 0.021118, "value_mse_loss_layer_009": 0.027954, "value_mse_loss_layer_010": 0.025269, "value_mse_loss_layer_011": 0.024414, "value_mse_loss_layer_012": 0.025391, "value_mse_loss_layer_013": 0.026978, "value_mse_loss_layer_014": 0.028564, "value_mse_loss_layer_015": 0.035889, "value_mse_loss_layer_016": 0.027466, "value_mse_loss_layer_017": 0.030151, "value_mse_loss_layer_018": 0.029053, "value_mse_loss_layer_019": 0.039062, "value_mse_loss_layer_020": 0.039551, "value_mse_loss_layer_021": 0.039551, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.057129, "value_mse_loss_layer_024": 0.047852, "value_mse_loss_layer_025": 0.05957, "value_mse_loss_layer_026": 0.054688, "value_mse_loss_layer_027": 0.066406, "value_mse_loss_layer_028": 0.069336, "value_mse_loss_layer_029": 0.099609, "value_mse_loss_layer_030": 0.097656, "value_mse_loss_layer_031": 0.105957, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 0.000104, "vq_loss_layer_005": 0.0001, "vq_loss_layer_006": 0.000176, "vq_loss_layer_007": 0.000263, "vq_loss_layer_008": 0.000259, "vq_loss_layer_009": 0.000383, "vq_loss_layer_010": 0.000334, "vq_loss_layer_011": 0.000311, "vq_loss_layer_012": 0.000492, "vq_loss_layer_013": 0.000465, "vq_loss_layer_014": 0.000507, "vq_loss_layer_015": 0.000748, "vq_loss_layer_016": 0.000526, "vq_loss_layer_017": 0.000454, "vq_loss_layer_018": 0.000334, "vq_loss_layer_019": 0.000263, "vq_loss_layer_020": 0.000267, "vq_loss_layer_021": 0.000431, "vq_loss_layer_022": 0.000317, "vq_loss_layer_023": 0.000603, "vq_loss_layer_024": 0.000395, "vq_loss_layer_025": 0.000383, "vq_loss_layer_026": 0.000675, "vq_loss_layer_027": 0.000687, "vq_loss_layer_028": 0.000946, "vq_loss_layer_029": 0.00164, "vq_loss_layer_030": 0.002869, "vq_loss_layer_031": 0.006104 }, { "ce_loss": 2.299809, "epoch": 0.00312, "grad_norm": 0.006894371006637812, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.057373, "kv_vq_loss": 0.000696, "learning_rate": 0.0008735386485046107, "loss": 0.058102, "step": 3120, "value_mse_loss_layer_000": 0.00082, "value_mse_loss_layer_001": 0.002319, "value_mse_loss_layer_002": 0.009521, "value_mse_loss_layer_003": 0.015625, "value_mse_loss_layer_004": 0.013184, "value_mse_loss_layer_005": 0.012695, "value_mse_loss_layer_006": 0.015747, "value_mse_loss_layer_007": 0.017212, "value_mse_loss_layer_008": 0.021118, "value_mse_loss_layer_009": 0.0271, "value_mse_loss_layer_010": 0.022583, "value_mse_loss_layer_011": 0.024048, "value_mse_loss_layer_012": 0.024536, "value_mse_loss_layer_013": 0.026611, "value_mse_loss_layer_014": 0.028564, "value_mse_loss_layer_015": 0.030273, "value_mse_loss_layer_016": 0.027832, "value_mse_loss_layer_017": 0.030151, "value_mse_loss_layer_018": 0.027832, "value_mse_loss_layer_019": 0.0354, "value_mse_loss_layer_020": 0.035645, "value_mse_loss_layer_021": 0.040039, "value_mse_loss_layer_022": 0.039307, "value_mse_loss_layer_023": 0.045654, "value_mse_loss_layer_024": 0.047852, "value_mse_loss_layer_025": 0.057617, "value_mse_loss_layer_026": 0.050781, "value_mse_loss_layer_027": 0.069824, "value_mse_loss_layer_028": 0.068359, "value_mse_loss_layer_029": 0.137695, "value_mse_loss_layer_030": 0.09082, "value_mse_loss_layer_031": 0.103027, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 0.000101, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.000176, "vq_loss_layer_007": 0.000254, "vq_loss_layer_008": 0.00025, "vq_loss_layer_009": 0.000341, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.000486, "vq_loss_layer_013": 0.000448, "vq_loss_layer_014": 0.00053, "vq_loss_layer_015": 0.000507, "vq_loss_layer_016": 0.000572, "vq_loss_layer_017": 0.000549, "vq_loss_layer_018": 0.00028, "vq_loss_layer_019": 0.000315, "vq_loss_layer_020": 0.000324, "vq_loss_layer_021": 0.000473, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.000364, "vq_loss_layer_024": 0.000319, "vq_loss_layer_025": 0.000366, "vq_loss_layer_026": 0.00061, "vq_loss_layer_027": 0.000801, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.00235, "vq_loss_layer_030": 0.002625, "vq_loss_layer_031": 0.006409 }, { "ce_loss": 2.330912, "epoch": 0.00313, "grad_norm": 0.004198032431304455, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.077637, "kv_mse_loss": 0.057083, "kv_vq_loss": 0.000674, "learning_rate": 0.000873886084386612, "loss": 0.057794, "step": 3130, "value_mse_loss_layer_000": 0.000816, "value_mse_loss_layer_001": 0.002304, "value_mse_loss_layer_002": 0.009216, "value_mse_loss_layer_003": 0.015015, "value_mse_loss_layer_004": 0.012634, "value_mse_loss_layer_005": 0.012817, "value_mse_loss_layer_006": 0.016113, "value_mse_loss_layer_007": 0.017456, "value_mse_loss_layer_008": 0.020996, "value_mse_loss_layer_009": 0.026855, "value_mse_loss_layer_010": 0.023071, "value_mse_loss_layer_011": 0.025269, "value_mse_loss_layer_012": 0.025024, "value_mse_loss_layer_013": 0.027222, "value_mse_loss_layer_014": 0.028198, "value_mse_loss_layer_015": 0.030518, "value_mse_loss_layer_016": 0.027466, "value_mse_loss_layer_017": 0.031494, "value_mse_loss_layer_018": 0.027222, "value_mse_loss_layer_019": 0.032715, "value_mse_loss_layer_020": 0.033936, "value_mse_loss_layer_021": 0.048828, "value_mse_loss_layer_022": 0.039062, "value_mse_loss_layer_023": 0.043213, "value_mse_loss_layer_024": 0.046143, "value_mse_loss_layer_025": 0.056641, "value_mse_loss_layer_026": 0.047363, "value_mse_loss_layer_027": 0.065918, "value_mse_loss_layer_028": 0.064941, "value_mse_loss_layer_029": 0.095215, "value_mse_loss_layer_030": 0.084961, "value_mse_loss_layer_031": 0.104004, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 7e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000182, "vq_loss_layer_007": 0.000273, "vq_loss_layer_008": 0.000238, "vq_loss_layer_009": 0.000319, "vq_loss_layer_010": 0.000263, "vq_loss_layer_011": 0.000345, "vq_loss_layer_012": 0.000507, "vq_loss_layer_013": 0.000475, "vq_loss_layer_014": 0.0005, "vq_loss_layer_015": 0.000515, "vq_loss_layer_016": 0.000542, "vq_loss_layer_017": 0.000538, "vq_loss_layer_018": 0.000259, "vq_loss_layer_019": 0.000204, "vq_loss_layer_020": 0.000256, "vq_loss_layer_021": 0.000645, "vq_loss_layer_022": 0.000338, "vq_loss_layer_023": 0.000341, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.00036, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000648, "vq_loss_layer_028": 0.000889, "vq_loss_layer_029": 0.001839, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.006378 }, { "ce_loss": 2.312598, "epoch": 0.00314, "grad_norm": 0.004622454755008221, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.066406, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.07373, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.057422, "kv_vq_loss": 0.000675, "learning_rate": 0.0008742324120183038, "loss": 0.058127, "step": 3140, "value_mse_loss_layer_000": 0.000824, "value_mse_loss_layer_001": 0.002289, "value_mse_loss_layer_002": 0.009033, "value_mse_loss_layer_003": 0.014221, "value_mse_loss_layer_004": 0.014832, "value_mse_loss_layer_005": 0.013489, "value_mse_loss_layer_006": 0.015625, "value_mse_loss_layer_007": 0.0177, "value_mse_loss_layer_008": 0.020386, "value_mse_loss_layer_009": 0.0271, "value_mse_loss_layer_010": 0.02832, "value_mse_loss_layer_011": 0.025635, "value_mse_loss_layer_012": 0.024902, "value_mse_loss_layer_013": 0.027466, "value_mse_loss_layer_014": 0.031494, "value_mse_loss_layer_015": 0.031006, "value_mse_loss_layer_016": 0.026001, "value_mse_loss_layer_017": 0.030518, "value_mse_loss_layer_018": 0.028809, "value_mse_loss_layer_019": 0.031738, "value_mse_loss_layer_020": 0.040527, "value_mse_loss_layer_021": 0.040039, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.041992, "value_mse_loss_layer_024": 0.050537, "value_mse_loss_layer_025": 0.083984, "value_mse_loss_layer_026": 0.047363, "value_mse_loss_layer_027": 0.060303, "value_mse_loss_layer_028": 0.062988, "value_mse_loss_layer_029": 0.087402, "value_mse_loss_layer_030": 0.08252, "value_mse_loss_layer_031": 0.099609, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 0.000154, "vq_loss_layer_005": 0.000113, "vq_loss_layer_006": 0.000164, "vq_loss_layer_007": 0.000267, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000311, "vq_loss_layer_010": 0.000437, "vq_loss_layer_011": 0.000366, "vq_loss_layer_012": 0.000511, "vq_loss_layer_013": 0.000488, "vq_loss_layer_014": 0.000675, "vq_loss_layer_015": 0.000557, "vq_loss_layer_016": 0.000511, "vq_loss_layer_017": 0.00053, "vq_loss_layer_018": 0.000319, "vq_loss_layer_019": 0.000241, "vq_loss_layer_020": 0.000347, "vq_loss_layer_021": 0.000542, "vq_loss_layer_022": 0.000372, "vq_loss_layer_023": 0.000378, "vq_loss_layer_024": 0.000433, "vq_loss_layer_025": 0.000698, "vq_loss_layer_026": 0.000607, "vq_loss_layer_027": 0.000656, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.001511, "vq_loss_layer_030": 0.002823, "vq_loss_layer_031": 0.006165 }, { "ce_loss": 2.247398, "epoch": 0.00315, "grad_norm": 0.0059590646997094154, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.058032, "kv_vq_loss": 0.000722, "learning_rate": 0.0008745776384474, "loss": 0.058783, "step": 3150, "value_mse_loss_layer_000": 0.000805, "value_mse_loss_layer_001": 0.002274, "value_mse_loss_layer_002": 0.008972, "value_mse_loss_layer_003": 0.014771, "value_mse_loss_layer_004": 0.013367, "value_mse_loss_layer_005": 0.013062, "value_mse_loss_layer_006": 0.015564, "value_mse_loss_layer_007": 0.017334, "value_mse_loss_layer_008": 0.021606, "value_mse_loss_layer_009": 0.026489, "value_mse_loss_layer_010": 0.02417, "value_mse_loss_layer_011": 0.023926, "value_mse_loss_layer_012": 0.024902, "value_mse_loss_layer_013": 0.026367, "value_mse_loss_layer_014": 0.028564, "value_mse_loss_layer_015": 0.030151, "value_mse_loss_layer_016": 0.025391, "value_mse_loss_layer_017": 0.029541, "value_mse_loss_layer_018": 0.027954, "value_mse_loss_layer_019": 0.031738, "value_mse_loss_layer_020": 0.035156, "value_mse_loss_layer_021": 0.039795, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.043457, "value_mse_loss_layer_024": 0.052002, "value_mse_loss_layer_025": 0.062988, "value_mse_loss_layer_026": 0.050537, "value_mse_loss_layer_027": 0.064453, "value_mse_loss_layer_028": 0.066406, "value_mse_loss_layer_029": 0.116211, "value_mse_loss_layer_030": 0.085938, "value_mse_loss_layer_031": 0.099609, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 9.1e-05, "vq_loss_layer_006": 0.000159, "vq_loss_layer_007": 0.000237, "vq_loss_layer_008": 0.00029, "vq_loss_layer_009": 0.000322, "vq_loss_layer_010": 0.00029, "vq_loss_layer_011": 0.000301, "vq_loss_layer_012": 0.000515, "vq_loss_layer_013": 0.000416, "vq_loss_layer_014": 0.000565, "vq_loss_layer_015": 0.000557, "vq_loss_layer_016": 0.000526, "vq_loss_layer_017": 0.000431, "vq_loss_layer_018": 0.000313, "vq_loss_layer_019": 0.000234, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000568, "vq_loss_layer_022": 0.000345, "vq_loss_layer_023": 0.000397, "vq_loss_layer_024": 0.000408, "vq_loss_layer_025": 0.000425, "vq_loss_layer_026": 0.000683, "vq_loss_layer_027": 0.000679, "vq_loss_layer_028": 0.000942, "vq_loss_layer_029": 0.001747, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.00592 }, { "ce_loss": 2.273943, "epoch": 0.00316, "grad_norm": 0.0030772127211093903, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.058063, "kv_vq_loss": 0.000746, "learning_rate": 0.0008749217706546009, "loss": 0.058832, "step": 3160, "value_mse_loss_layer_000": 0.00082, "value_mse_loss_layer_001": 0.002335, "value_mse_loss_layer_002": 0.009094, "value_mse_loss_layer_003": 0.014343, "value_mse_loss_layer_004": 0.013489, "value_mse_loss_layer_005": 0.013428, "value_mse_loss_layer_006": 0.015869, "value_mse_loss_layer_007": 0.0177, "value_mse_loss_layer_008": 0.020752, "value_mse_loss_layer_009": 0.027222, "value_mse_loss_layer_010": 0.023315, "value_mse_loss_layer_011": 0.023926, "value_mse_loss_layer_012": 0.026001, "value_mse_loss_layer_013": 0.026611, "value_mse_loss_layer_014": 0.029907, "value_mse_loss_layer_015": 0.030396, "value_mse_loss_layer_016": 0.027588, "value_mse_loss_layer_017": 0.030151, "value_mse_loss_layer_018": 0.027954, "value_mse_loss_layer_019": 0.03125, "value_mse_loss_layer_020": 0.033936, "value_mse_loss_layer_021": 0.039795, "value_mse_loss_layer_022": 0.037842, "value_mse_loss_layer_023": 0.041748, "value_mse_loss_layer_024": 0.045898, "value_mse_loss_layer_025": 0.062256, "value_mse_loss_layer_026": 0.049072, "value_mse_loss_layer_027": 0.066406, "value_mse_loss_layer_028": 0.065918, "value_mse_loss_layer_029": 0.092285, "value_mse_loss_layer_030": 0.085938, "value_mse_loss_layer_031": 0.103027, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 9.8e-05, "vq_loss_layer_006": 0.000164, "vq_loss_layer_007": 0.000261, "vq_loss_layer_008": 0.000256, "vq_loss_layer_009": 0.000349, "vq_loss_layer_010": 0.000288, "vq_loss_layer_011": 0.000296, "vq_loss_layer_012": 0.000568, "vq_loss_layer_013": 0.000437, "vq_loss_layer_014": 0.000565, "vq_loss_layer_015": 0.00058, "vq_loss_layer_016": 0.000587, "vq_loss_layer_017": 0.000534, "vq_loss_layer_018": 0.000267, "vq_loss_layer_019": 0.000213, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.000549, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.000359, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.000456, "vq_loss_layer_026": 0.000645, "vq_loss_layer_027": 0.00074, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.001534, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.0065 }, { "ce_loss": 2.253732, "epoch": 0.00317, "grad_norm": 0.004418065771460533, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.057312, "kv_vq_loss": 0.000702, "learning_rate": 0.0008752648155544378, "loss": 0.058029, "step": 3170, "value_mse_loss_layer_000": 0.00079, "value_mse_loss_layer_001": 0.002304, "value_mse_loss_layer_002": 0.009155, "value_mse_loss_layer_003": 0.014587, "value_mse_loss_layer_004": 0.013428, "value_mse_loss_layer_005": 0.013916, "value_mse_loss_layer_006": 0.015747, "value_mse_loss_layer_007": 0.017578, "value_mse_loss_layer_008": 0.02124, "value_mse_loss_layer_009": 0.026367, "value_mse_loss_layer_010": 0.022339, "value_mse_loss_layer_011": 0.024292, "value_mse_loss_layer_012": 0.025757, "value_mse_loss_layer_013": 0.026733, "value_mse_loss_layer_014": 0.027954, "value_mse_loss_layer_015": 0.03064, "value_mse_loss_layer_016": 0.026001, "value_mse_loss_layer_017": 0.030762, "value_mse_loss_layer_018": 0.027832, "value_mse_loss_layer_019": 0.032959, "value_mse_loss_layer_020": 0.033936, "value_mse_loss_layer_021": 0.041992, "value_mse_loss_layer_022": 0.036377, "value_mse_loss_layer_023": 0.043213, "value_mse_loss_layer_024": 0.045654, "value_mse_loss_layer_025": 0.058838, "value_mse_loss_layer_026": 0.047119, "value_mse_loss_layer_027": 0.060547, "value_mse_loss_layer_028": 0.069824, "value_mse_loss_layer_029": 0.099121, "value_mse_loss_layer_030": 0.084473, "value_mse_loss_layer_031": 0.102051, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 0.000118, "vq_loss_layer_006": 0.000169, "vq_loss_layer_007": 0.000243, "vq_loss_layer_008": 0.000282, "vq_loss_layer_009": 0.000298, "vq_loss_layer_010": 0.000265, "vq_loss_layer_011": 0.000326, "vq_loss_layer_012": 0.000542, "vq_loss_layer_013": 0.000467, "vq_loss_layer_014": 0.000542, "vq_loss_layer_015": 0.000553, "vq_loss_layer_016": 0.000534, "vq_loss_layer_017": 0.000561, "vq_loss_layer_018": 0.000307, "vq_loss_layer_019": 0.000256, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.000549, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000399, "vq_loss_layer_024": 0.000341, "vq_loss_layer_025": 0.000427, "vq_loss_layer_026": 0.000572, "vq_loss_layer_027": 0.000645, "vq_loss_layer_028": 0.001305, "vq_loss_layer_029": 0.00174, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.006744 }, { "ce_loss": 2.328019, "epoch": 0.00318, "grad_norm": 0.004636823665350676, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.075684, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.057739, "kv_vq_loss": 0.000705, "learning_rate": 0.000875606779996108, "loss": 0.058475, "step": 3180, "value_mse_loss_layer_000": 0.000805, "value_mse_loss_layer_001": 0.002304, "value_mse_loss_layer_002": 0.009094, "value_mse_loss_layer_003": 0.014832, "value_mse_loss_layer_004": 0.01355, "value_mse_loss_layer_005": 0.013428, "value_mse_loss_layer_006": 0.015747, "value_mse_loss_layer_007": 0.017456, "value_mse_loss_layer_008": 0.020752, "value_mse_loss_layer_009": 0.026855, "value_mse_loss_layer_010": 0.029053, "value_mse_loss_layer_011": 0.024902, "value_mse_loss_layer_012": 0.025391, "value_mse_loss_layer_013": 0.026733, "value_mse_loss_layer_014": 0.028076, "value_mse_loss_layer_015": 0.031494, "value_mse_loss_layer_016": 0.025879, "value_mse_loss_layer_017": 0.030396, "value_mse_loss_layer_018": 0.028931, "value_mse_loss_layer_019": 0.032471, "value_mse_loss_layer_020": 0.03418, "value_mse_loss_layer_021": 0.04541, "value_mse_loss_layer_022": 0.042725, "value_mse_loss_layer_023": 0.047363, "value_mse_loss_layer_024": 0.048096, "value_mse_loss_layer_025": 0.066406, "value_mse_loss_layer_026": 0.052979, "value_mse_loss_layer_027": 0.06543, "value_mse_loss_layer_028": 0.069336, "value_mse_loss_layer_029": 0.101562, "value_mse_loss_layer_030": 0.092285, "value_mse_loss_layer_031": 0.106934, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 8.6e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000176, "vq_loss_layer_007": 0.000238, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.000303, "vq_loss_layer_010": 0.00038, "vq_loss_layer_011": 0.00036, "vq_loss_layer_012": 0.00053, "vq_loss_layer_013": 0.000416, "vq_loss_layer_014": 0.000519, "vq_loss_layer_015": 0.000584, "vq_loss_layer_016": 0.000519, "vq_loss_layer_017": 0.000462, "vq_loss_layer_018": 0.000315, "vq_loss_layer_019": 0.000232, "vq_loss_layer_020": 0.000248, "vq_loss_layer_021": 0.000713, "vq_loss_layer_022": 0.000404, "vq_loss_layer_023": 0.000408, "vq_loss_layer_024": 0.000357, "vq_loss_layer_025": 0.000511, "vq_loss_layer_026": 0.000717, "vq_loss_layer_027": 0.000671, "vq_loss_layer_028": 0.001083, "vq_loss_layer_029": 0.001793, "vq_loss_layer_030": 0.002853, "vq_loss_layer_031": 0.006927 }, { "ce_loss": 2.262441, "epoch": 0.00319, "grad_norm": 0.004747536964714527, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.059326, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.057825, "kv_vq_loss": 0.00071, "learning_rate": 0.0008759476707642952, "loss": 0.058557, "step": 3190, "value_mse_loss_layer_000": 0.000801, "value_mse_loss_layer_001": 0.002274, "value_mse_loss_layer_002": 0.009033, "value_mse_loss_layer_003": 0.014587, "value_mse_loss_layer_004": 0.013062, "value_mse_loss_layer_005": 0.012695, "value_mse_loss_layer_006": 0.018066, "value_mse_loss_layer_007": 0.017456, "value_mse_loss_layer_008": 0.020874, "value_mse_loss_layer_009": 0.026855, "value_mse_loss_layer_010": 0.024902, "value_mse_loss_layer_011": 0.024292, "value_mse_loss_layer_012": 0.026367, "value_mse_loss_layer_013": 0.026123, "value_mse_loss_layer_014": 0.028198, "value_mse_loss_layer_015": 0.030518, "value_mse_loss_layer_016": 0.025024, "value_mse_loss_layer_017": 0.029907, "value_mse_loss_layer_018": 0.0271, "value_mse_loss_layer_019": 0.032471, "value_mse_loss_layer_020": 0.034912, "value_mse_loss_layer_021": 0.04126, "value_mse_loss_layer_022": 0.038574, "value_mse_loss_layer_023": 0.042236, "value_mse_loss_layer_024": 0.057129, "value_mse_loss_layer_025": 0.0625, "value_mse_loss_layer_026": 0.049316, "value_mse_loss_layer_027": 0.067383, "value_mse_loss_layer_028": 0.068359, "value_mse_loss_layer_029": 0.095703, "value_mse_loss_layer_030": 0.085449, "value_mse_loss_layer_031": 0.103516, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 0.000101, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000286, "vq_loss_layer_007": 0.000275, "vq_loss_layer_008": 0.000265, "vq_loss_layer_009": 0.000347, "vq_loss_layer_010": 0.000307, "vq_loss_layer_011": 0.000319, "vq_loss_layer_012": 0.000656, "vq_loss_layer_013": 0.000408, "vq_loss_layer_014": 0.000561, "vq_loss_layer_015": 0.000549, "vq_loss_layer_016": 0.000492, "vq_loss_layer_017": 0.000443, "vq_loss_layer_018": 0.000296, "vq_loss_layer_019": 0.000223, "vq_loss_layer_020": 0.000332, "vq_loss_layer_021": 0.000542, "vq_loss_layer_022": 0.000349, "vq_loss_layer_023": 0.000374, "vq_loss_layer_024": 0.000477, "vq_loss_layer_025": 0.000433, "vq_loss_layer_026": 0.000648, "vq_loss_layer_027": 0.000774, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.001419, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.005676 }, { "ce_loss": 2.31682, "epoch": 0.0032, "grad_norm": 0.004639438819140196, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.10791, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.057602, "kv_vq_loss": 0.000686, "learning_rate": 0.0008762874945799765, "loss": 0.058325, "step": 3200, "value_mse_loss_layer_000": 0.000801, "value_mse_loss_layer_001": 0.002274, "value_mse_loss_layer_002": 0.009399, "value_mse_loss_layer_003": 0.015076, "value_mse_loss_layer_004": 0.013123, "value_mse_loss_layer_005": 0.013245, "value_mse_loss_layer_006": 0.015442, "value_mse_loss_layer_007": 0.016968, "value_mse_loss_layer_008": 0.020264, "value_mse_loss_layer_009": 0.025879, "value_mse_loss_layer_010": 0.022339, "value_mse_loss_layer_011": 0.023804, "value_mse_loss_layer_012": 0.024048, "value_mse_loss_layer_013": 0.028687, "value_mse_loss_layer_014": 0.027954, "value_mse_loss_layer_015": 0.028809, "value_mse_loss_layer_016": 0.025879, "value_mse_loss_layer_017": 0.028809, "value_mse_loss_layer_018": 0.028076, "value_mse_loss_layer_019": 0.031128, "value_mse_loss_layer_020": 0.033936, "value_mse_loss_layer_021": 0.037842, "value_mse_loss_layer_022": 0.036133, "value_mse_loss_layer_023": 0.049316, "value_mse_loss_layer_024": 0.049805, "value_mse_loss_layer_025": 0.057373, "value_mse_loss_layer_026": 0.050049, "value_mse_loss_layer_027": 0.062988, "value_mse_loss_layer_028": 0.061035, "value_mse_loss_layer_029": 0.092285, "value_mse_loss_layer_030": 0.091309, "value_mse_loss_layer_031": 0.101074, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 0.0001, "vq_loss_layer_006": 0.000165, "vq_loss_layer_007": 0.000238, "vq_loss_layer_008": 0.000271, "vq_loss_layer_009": 0.000328, "vq_loss_layer_010": 0.00028, "vq_loss_layer_011": 0.000341, "vq_loss_layer_012": 0.000523, "vq_loss_layer_013": 0.000637, "vq_loss_layer_014": 0.000641, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.000557, "vq_loss_layer_017": 0.000458, "vq_loss_layer_018": 0.000252, "vq_loss_layer_019": 0.000241, "vq_loss_layer_020": 0.00032, "vq_loss_layer_021": 0.000519, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.000496, "vq_loss_layer_024": 0.000408, "vq_loss_layer_025": 0.000454, "vq_loss_layer_026": 0.000797, "vq_loss_layer_027": 0.000732, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.00145, "vq_loss_layer_030": 0.00386, "vq_loss_layer_031": 0.006531 }, { "ce_loss": 2.32018, "epoch": 0.00321, "grad_norm": 0.00417208019644022, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.057001, "kv_vq_loss": 0.000692, "learning_rate": 0.0008766262581012178, "loss": 0.057709, "step": 3210, "value_mse_loss_layer_000": 0.000801, "value_mse_loss_layer_001": 0.002411, "value_mse_loss_layer_002": 0.008911, "value_mse_loss_layer_003": 0.014282, "value_mse_loss_layer_004": 0.013, "value_mse_loss_layer_005": 0.013123, "value_mse_loss_layer_006": 0.015259, "value_mse_loss_layer_007": 0.016846, "value_mse_loss_layer_008": 0.020752, "value_mse_loss_layer_009": 0.026001, "value_mse_loss_layer_010": 0.022217, "value_mse_loss_layer_011": 0.023804, "value_mse_loss_layer_012": 0.024048, "value_mse_loss_layer_013": 0.025635, "value_mse_loss_layer_014": 0.0271, "value_mse_loss_layer_015": 0.029419, "value_mse_loss_layer_016": 0.025269, "value_mse_loss_layer_017": 0.028931, "value_mse_loss_layer_018": 0.026733, "value_mse_loss_layer_019": 0.03125, "value_mse_loss_layer_020": 0.032959, "value_mse_loss_layer_021": 0.039307, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.041992, "value_mse_loss_layer_024": 0.04834, "value_mse_loss_layer_025": 0.061279, "value_mse_loss_layer_026": 0.049805, "value_mse_loss_layer_027": 0.065918, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.091309, "value_mse_loss_layer_030": 0.085449, "value_mse_loss_layer_031": 0.103027, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 9.2e-05, "vq_loss_layer_006": 0.000159, "vq_loss_layer_007": 0.000241, "vq_loss_layer_008": 0.000278, "vq_loss_layer_009": 0.000301, "vq_loss_layer_010": 0.000259, "vq_loss_layer_011": 0.000319, "vq_loss_layer_012": 0.000488, "vq_loss_layer_013": 0.000414, "vq_loss_layer_014": 0.00053, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.00053, "vq_loss_layer_017": 0.000496, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.000214, "vq_loss_layer_020": 0.000257, "vq_loss_layer_021": 0.000511, "vq_loss_layer_022": 0.00032, "vq_loss_layer_023": 0.000332, "vq_loss_layer_024": 0.00037, "vq_loss_layer_025": 0.000456, "vq_loss_layer_026": 0.000717, "vq_loss_layer_027": 0.000809, "vq_loss_layer_028": 0.001068, "vq_loss_layer_029": 0.001671, "vq_loss_layer_030": 0.003281, "vq_loss_layer_031": 0.006958 }, { "ce_loss": 2.273839, "epoch": 0.00322, "grad_norm": 0.005170504562556744, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.058044, "kv_vq_loss": 0.000719, "learning_rate": 0.0008769639679239576, "loss": 0.058801, "step": 3220, "value_mse_loss_layer_000": 0.000763, "value_mse_loss_layer_001": 0.002304, "value_mse_loss_layer_002": 0.009338, "value_mse_loss_layer_003": 0.014832, "value_mse_loss_layer_004": 0.013672, "value_mse_loss_layer_005": 0.013428, "value_mse_loss_layer_006": 0.015381, "value_mse_loss_layer_007": 0.016846, "value_mse_loss_layer_008": 0.020386, "value_mse_loss_layer_009": 0.0271, "value_mse_loss_layer_010": 0.023926, "value_mse_loss_layer_011": 0.023438, "value_mse_loss_layer_012": 0.025757, "value_mse_loss_layer_013": 0.025269, "value_mse_loss_layer_014": 0.028076, "value_mse_loss_layer_015": 0.029297, "value_mse_loss_layer_016": 0.024902, "value_mse_loss_layer_017": 0.02832, "value_mse_loss_layer_018": 0.028076, "value_mse_loss_layer_019": 0.031494, "value_mse_loss_layer_020": 0.034912, "value_mse_loss_layer_021": 0.05835, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.047852, "value_mse_loss_layer_024": 0.049072, "value_mse_loss_layer_025": 0.0625, "value_mse_loss_layer_026": 0.048584, "value_mse_loss_layer_027": 0.064453, "value_mse_loss_layer_028": 0.069336, "value_mse_loss_layer_029": 0.102051, "value_mse_loss_layer_030": 0.089844, "value_mse_loss_layer_031": 0.107422, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 9.6e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000176, "vq_loss_layer_007": 0.000237, "vq_loss_layer_008": 0.000263, "vq_loss_layer_009": 0.00038, "vq_loss_layer_010": 0.000364, "vq_loss_layer_011": 0.000311, "vq_loss_layer_012": 0.000626, "vq_loss_layer_013": 0.000376, "vq_loss_layer_014": 0.000603, "vq_loss_layer_015": 0.000565, "vq_loss_layer_016": 0.000511, "vq_loss_layer_017": 0.000404, "vq_loss_layer_018": 0.000305, "vq_loss_layer_019": 0.00024, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.000858, "vq_loss_layer_022": 0.000332, "vq_loss_layer_023": 0.000423, "vq_loss_layer_024": 0.000332, "vq_loss_layer_025": 0.000437, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.000736, "vq_loss_layer_028": 0.001022, "vq_loss_layer_029": 0.001587, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.006989 }, { "ce_loss": 2.323364, "epoch": 0.00323, "grad_norm": 0.0036497118417173624, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.057098, "kv_vq_loss": 0.00068, "learning_rate": 0.0008773006305827757, "loss": 0.057782, "step": 3230, "value_mse_loss_layer_000": 0.000809, "value_mse_loss_layer_001": 0.002274, "value_mse_loss_layer_002": 0.009033, "value_mse_loss_layer_003": 0.014282, "value_mse_loss_layer_004": 0.012817, "value_mse_loss_layer_005": 0.013184, "value_mse_loss_layer_006": 0.015869, "value_mse_loss_layer_007": 0.017334, "value_mse_loss_layer_008": 0.020264, "value_mse_loss_layer_009": 0.026978, "value_mse_loss_layer_010": 0.022949, "value_mse_loss_layer_011": 0.025269, "value_mse_loss_layer_012": 0.02478, "value_mse_loss_layer_013": 0.026245, "value_mse_loss_layer_014": 0.027954, "value_mse_loss_layer_015": 0.03125, "value_mse_loss_layer_016": 0.025269, "value_mse_loss_layer_017": 0.031738, "value_mse_loss_layer_018": 0.028076, "value_mse_loss_layer_019": 0.032471, "value_mse_loss_layer_020": 0.033691, "value_mse_loss_layer_021": 0.044189, "value_mse_loss_layer_022": 0.038086, "value_mse_loss_layer_023": 0.043945, "value_mse_loss_layer_024": 0.04541, "value_mse_loss_layer_025": 0.05835, "value_mse_loss_layer_026": 0.047852, "value_mse_loss_layer_027": 0.062988, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.103027, "value_mse_loss_layer_030": 0.084473, "value_mse_loss_layer_031": 0.100098, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000186, "vq_loss_layer_007": 0.000257, "vq_loss_layer_008": 0.000241, "vq_loss_layer_009": 0.000349, "vq_loss_layer_010": 0.000286, "vq_loss_layer_011": 0.000372, "vq_loss_layer_012": 0.000519, "vq_loss_layer_013": 0.000452, "vq_loss_layer_014": 0.000523, "vq_loss_layer_015": 0.000587, "vq_loss_layer_016": 0.0005, "vq_loss_layer_017": 0.000542, "vq_loss_layer_018": 0.000278, "vq_loss_layer_019": 0.000252, "vq_loss_layer_020": 0.000278, "vq_loss_layer_021": 0.000629, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.00041, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000402, "vq_loss_layer_026": 0.000603, "vq_loss_layer_027": 0.000717, "vq_loss_layer_028": 0.000935, "vq_loss_layer_029": 0.001839, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.006195 }, { "ce_loss": 2.275898, "epoch": 0.00324, "grad_norm": 0.0051767295226454735, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.057648, "kv_vq_loss": 0.000709, "learning_rate": 0.000877636252551653, "loss": 0.058383, "step": 3240, "value_mse_loss_layer_000": 0.000793, "value_mse_loss_layer_001": 0.002258, "value_mse_loss_layer_002": 0.009094, "value_mse_loss_layer_003": 0.013855, "value_mse_loss_layer_004": 0.012695, "value_mse_loss_layer_005": 0.012634, "value_mse_loss_layer_006": 0.014893, "value_mse_loss_layer_007": 0.016602, "value_mse_loss_layer_008": 0.020142, "value_mse_loss_layer_009": 0.025391, "value_mse_loss_layer_010": 0.022339, "value_mse_loss_layer_011": 0.023315, "value_mse_loss_layer_012": 0.023926, "value_mse_loss_layer_013": 0.024536, "value_mse_loss_layer_014": 0.027832, "value_mse_loss_layer_015": 0.028809, "value_mse_loss_layer_016": 0.02478, "value_mse_loss_layer_017": 0.028442, "value_mse_loss_layer_018": 0.028198, "value_mse_loss_layer_019": 0.031982, "value_mse_loss_layer_020": 0.032715, "value_mse_loss_layer_021": 0.040771, "value_mse_loss_layer_022": 0.039795, "value_mse_loss_layer_023": 0.045898, "value_mse_loss_layer_024": 0.050293, "value_mse_loss_layer_025": 0.0625, "value_mse_loss_layer_026": 0.069336, "value_mse_loss_layer_027": 0.064941, "value_mse_loss_layer_028": 0.068848, "value_mse_loss_layer_029": 0.098633, "value_mse_loss_layer_030": 0.088867, "value_mse_loss_layer_031": 0.102051, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000153, "vq_loss_layer_007": 0.000241, "vq_loss_layer_008": 0.000241, "vq_loss_layer_009": 0.00029, "vq_loss_layer_010": 0.000257, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.000519, "vq_loss_layer_013": 0.000374, "vq_loss_layer_014": 0.000496, "vq_loss_layer_015": 0.000504, "vq_loss_layer_016": 0.000473, "vq_loss_layer_017": 0.000423, "vq_loss_layer_018": 0.000296, "vq_loss_layer_019": 0.000233, "vq_loss_layer_020": 0.000246, "vq_loss_layer_021": 0.000473, "vq_loss_layer_022": 0.000347, "vq_loss_layer_023": 0.000374, "vq_loss_layer_024": 0.000347, "vq_loss_layer_025": 0.000423, "vq_loss_layer_026": 0.001213, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000889, "vq_loss_layer_029": 0.001419, "vq_loss_layer_030": 0.002609, "vq_loss_layer_031": 0.006104 }, { "ce_loss": 2.319543, "epoch": 0.00325, "grad_norm": 0.004086723551154137, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.056967, "kv_vq_loss": 0.000675, "learning_rate": 0.0008779708402447184, "loss": 0.057657, "step": 3250, "value_mse_loss_layer_000": 0.000809, "value_mse_loss_layer_001": 0.002289, "value_mse_loss_layer_002": 0.008972, "value_mse_loss_layer_003": 0.014893, "value_mse_loss_layer_004": 0.013794, "value_mse_loss_layer_005": 0.012878, "value_mse_loss_layer_006": 0.016235, "value_mse_loss_layer_007": 0.017334, "value_mse_loss_layer_008": 0.020386, "value_mse_loss_layer_009": 0.026001, "value_mse_loss_layer_010": 0.022705, "value_mse_loss_layer_011": 0.02356, "value_mse_loss_layer_012": 0.02417, "value_mse_loss_layer_013": 0.025513, "value_mse_loss_layer_014": 0.026733, "value_mse_loss_layer_015": 0.030151, "value_mse_loss_layer_016": 0.027588, "value_mse_loss_layer_017": 0.029419, "value_mse_loss_layer_018": 0.028442, "value_mse_loss_layer_019": 0.031006, "value_mse_loss_layer_020": 0.037842, "value_mse_loss_layer_021": 0.039795, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.045898, "value_mse_loss_layer_024": 0.045898, "value_mse_loss_layer_025": 0.05957, "value_mse_loss_layer_026": 0.053223, "value_mse_loss_layer_027": 0.064453, "value_mse_loss_layer_028": 0.063477, "value_mse_loss_layer_029": 0.093262, "value_mse_loss_layer_030": 0.085938, "value_mse_loss_layer_031": 0.101074, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 0.000138, "vq_loss_layer_005": 0.000105, "vq_loss_layer_006": 0.000216, "vq_loss_layer_007": 0.000265, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.000324, "vq_loss_layer_010": 0.000294, "vq_loss_layer_011": 0.000311, "vq_loss_layer_012": 0.00053, "vq_loss_layer_013": 0.000425, "vq_loss_layer_014": 0.000538, "vq_loss_layer_015": 0.000607, "vq_loss_layer_016": 0.000618, "vq_loss_layer_017": 0.000496, "vq_loss_layer_018": 0.000296, "vq_loss_layer_019": 0.000215, "vq_loss_layer_020": 0.00038, "vq_loss_layer_021": 0.000526, "vq_loss_layer_022": 0.000332, "vq_loss_layer_023": 0.000395, "vq_loss_layer_024": 0.000345, "vq_loss_layer_025": 0.000467, "vq_loss_layer_026": 0.000824, "vq_loss_layer_027": 0.00074, "vq_loss_layer_028": 0.000904, "vq_loss_layer_029": 0.00148, "vq_loss_layer_030": 0.002686, "vq_loss_layer_031": 0.006226 }, { "ce_loss": 2.280445, "epoch": 0.00326, "grad_norm": 0.0058534168638288975, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.060059, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.057419, "kv_vq_loss": 0.000695, "learning_rate": 0.0008783044000169846, "loss": 0.058154, "step": 3260, "value_mse_loss_layer_000": 0.000767, "value_mse_loss_layer_001": 0.002258, "value_mse_loss_layer_002": 0.009644, "value_mse_loss_layer_003": 0.014526, "value_mse_loss_layer_004": 0.013977, "value_mse_loss_layer_005": 0.013306, "value_mse_loss_layer_006": 0.014954, "value_mse_loss_layer_007": 0.016968, "value_mse_loss_layer_008": 0.020264, "value_mse_loss_layer_009": 0.026855, "value_mse_loss_layer_010": 0.021973, "value_mse_loss_layer_011": 0.023438, "value_mse_loss_layer_012": 0.025513, "value_mse_loss_layer_013": 0.025269, "value_mse_loss_layer_014": 0.026489, "value_mse_loss_layer_015": 0.027344, "value_mse_loss_layer_016": 0.024536, "value_mse_loss_layer_017": 0.02771, "value_mse_loss_layer_018": 0.028931, "value_mse_loss_layer_019": 0.03064, "value_mse_loss_layer_020": 0.031982, "value_mse_loss_layer_021": 0.05249, "value_mse_loss_layer_022": 0.036621, "value_mse_loss_layer_023": 0.041016, "value_mse_loss_layer_024": 0.048584, "value_mse_loss_layer_025": 0.064453, "value_mse_loss_layer_026": 0.048584, "value_mse_loss_layer_027": 0.0625, "value_mse_loss_layer_028": 0.066406, "value_mse_loss_layer_029": 0.097656, "value_mse_loss_layer_030": 0.088379, "value_mse_loss_layer_031": 0.105957, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 2.3e-05, "vq_loss_layer_002": 3e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 0.000111, "vq_loss_layer_005": 0.000117, "vq_loss_layer_006": 0.000167, "vq_loss_layer_007": 0.000237, "vq_loss_layer_008": 0.000334, "vq_loss_layer_009": 0.000422, "vq_loss_layer_010": 0.00036, "vq_loss_layer_011": 0.000351, "vq_loss_layer_012": 0.000599, "vq_loss_layer_013": 0.000465, "vq_loss_layer_014": 0.000595, "vq_loss_layer_015": 0.000591, "vq_loss_layer_016": 0.00058, "vq_loss_layer_017": 0.000486, "vq_loss_layer_018": 0.000414, "vq_loss_layer_019": 0.000265, "vq_loss_layer_020": 0.000299, "vq_loss_layer_021": 0.00103, "vq_loss_layer_022": 0.000427, "vq_loss_layer_023": 0.0005, "vq_loss_layer_024": 0.000622, "vq_loss_layer_025": 0.000912, "vq_loss_layer_026": 0.000862, "vq_loss_layer_027": 0.000965, "vq_loss_layer_028": 0.001503, "vq_loss_layer_029": 0.002502, "vq_loss_layer_030": 0.003296, "vq_loss_layer_031": 0.009094 }, { "ce_loss": 2.343792, "epoch": 0.00327, "grad_norm": 0.005237058736383915, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.011597, "key_mse_loss_layer_002": 0.062256, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.104004, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.111816, "key_mse_loss_layer_016": 0.106934, "key_mse_loss_layer_017": 0.10498, "key_mse_loss_layer_018": 0.116699, "key_mse_loss_layer_019": 0.094727, "key_mse_loss_layer_020": 0.105469, "key_mse_loss_layer_021": 0.098633, "key_mse_loss_layer_022": 0.105469, "key_mse_loss_layer_023": 0.106934, "key_mse_loss_layer_024": 0.086426, "key_mse_loss_layer_025": 0.081543, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.096191, "key_mse_loss_layer_028": 0.101562, "key_mse_loss_layer_029": 0.09375, "key_mse_loss_layer_030": 0.099121, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.05752, "kv_vq_loss": 0.000685, "learning_rate": 0.0008786369381650714, "loss": 0.058237, "step": 3270, "value_mse_loss_layer_000": 0.000778, "value_mse_loss_layer_001": 0.002304, "value_mse_loss_layer_002": 0.009155, "value_mse_loss_layer_003": 0.015442, "value_mse_loss_layer_004": 0.013611, "value_mse_loss_layer_005": 0.013306, "value_mse_loss_layer_006": 0.016479, "value_mse_loss_layer_007": 0.017456, "value_mse_loss_layer_008": 0.020264, "value_mse_loss_layer_009": 0.024902, "value_mse_loss_layer_010": 0.021851, "value_mse_loss_layer_011": 0.023926, "value_mse_loss_layer_012": 0.023682, "value_mse_loss_layer_013": 0.025024, "value_mse_loss_layer_014": 0.026733, "value_mse_loss_layer_015": 0.026001, "value_mse_loss_layer_016": 0.025146, "value_mse_loss_layer_017": 0.027466, "value_mse_loss_layer_018": 0.029297, "value_mse_loss_layer_019": 0.032471, "value_mse_loss_layer_020": 0.032715, "value_mse_loss_layer_021": 0.03833, "value_mse_loss_layer_022": 0.037842, "value_mse_loss_layer_023": 0.067383, "value_mse_loss_layer_024": 0.05249, "value_mse_loss_layer_025": 0.062988, "value_mse_loss_layer_026": 0.05249, "value_mse_loss_layer_027": 0.075195, "value_mse_loss_layer_028": 0.069824, "value_mse_loss_layer_029": 0.111328, "value_mse_loss_layer_030": 0.101562, "value_mse_loss_layer_031": 0.114746, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 2.5e-05, "vq_loss_layer_002": 2.7e-05, "vq_loss_layer_003": 5.2e-05, "vq_loss_layer_004": 9.5e-05, "vq_loss_layer_005": 0.000117, "vq_loss_layer_006": 0.00025, "vq_loss_layer_007": 0.000311, "vq_loss_layer_008": 0.000328, "vq_loss_layer_009": 0.000338, "vq_loss_layer_010": 0.000345, "vq_loss_layer_011": 0.000366, "vq_loss_layer_012": 0.000526, "vq_loss_layer_013": 0.000492, "vq_loss_layer_014": 0.00061, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.00066, "vq_loss_layer_017": 0.000488, "vq_loss_layer_018": 0.00041, "vq_loss_layer_019": 0.000326, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.000507, "vq_loss_layer_022": 0.000391, "vq_loss_layer_023": 0.000832, "vq_loss_layer_024": 0.000402, "vq_loss_layer_025": 0.000641, "vq_loss_layer_026": 0.000668, "vq_loss_layer_027": 0.000999, "vq_loss_layer_028": 0.001373, "vq_loss_layer_029": 0.002487, "vq_loss_layer_030": 0.003357, "vq_loss_layer_031": 0.009094 }, { "ce_loss": 2.290813, "epoch": 0.00328, "grad_norm": 0.00380706530995667, "key_mse_loss_layer_000": 0.002655, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.043945, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.090332, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.108887, "key_mse_loss_layer_011": 0.104492, "key_mse_loss_layer_012": 0.080078, "key_mse_loss_layer_013": 0.141602, "key_mse_loss_layer_014": 0.136719, "key_mse_loss_layer_015": 0.125977, "key_mse_loss_layer_016": 0.118652, "key_mse_loss_layer_017": 0.120117, "key_mse_loss_layer_018": 0.124023, "key_mse_loss_layer_019": 0.100098, "key_mse_loss_layer_020": 0.116699, "key_mse_loss_layer_021": 0.108887, "key_mse_loss_layer_022": 0.115234, "key_mse_loss_layer_023": 0.113281, "key_mse_loss_layer_024": 0.086914, "key_mse_loss_layer_025": 0.083008, "key_mse_loss_layer_026": 0.09668, "key_mse_loss_layer_027": 0.091309, "key_mse_loss_layer_028": 0.100586, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.057117, "kv_vq_loss": 0.000674, "learning_rate": 0.0008789684609279197, "loss": 0.057816, "step": 3280, "value_mse_loss_layer_000": 0.000759, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.009399, "value_mse_loss_layer_003": 0.016968, "value_mse_loss_layer_004": 0.015503, "value_mse_loss_layer_005": 0.014038, "value_mse_loss_layer_006": 0.01709, "value_mse_loss_layer_007": 0.018188, "value_mse_loss_layer_008": 0.02002, "value_mse_loss_layer_009": 0.027466, "value_mse_loss_layer_010": 0.024048, "value_mse_loss_layer_011": 0.023804, "value_mse_loss_layer_012": 0.025513, "value_mse_loss_layer_013": 0.026367, "value_mse_loss_layer_014": 0.026978, "value_mse_loss_layer_015": 0.026611, "value_mse_loss_layer_016": 0.022705, "value_mse_loss_layer_017": 0.028931, "value_mse_loss_layer_018": 0.025269, "value_mse_loss_layer_019": 0.029785, "value_mse_loss_layer_020": 0.033203, "value_mse_loss_layer_021": 0.037598, "value_mse_loss_layer_022": 0.03418, "value_mse_loss_layer_023": 0.041504, "value_mse_loss_layer_024": 0.041992, "value_mse_loss_layer_025": 0.057129, "value_mse_loss_layer_026": 0.051514, "value_mse_loss_layer_027": 0.062012, "value_mse_loss_layer_028": 0.061523, "value_mse_loss_layer_029": 0.085938, "value_mse_loss_layer_030": 0.083984, "value_mse_loss_layer_031": 0.101074, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2.2e-05, "vq_loss_layer_002": 2.9e-05, "vq_loss_layer_003": 7.4e-05, "vq_loss_layer_004": 0.000173, "vq_loss_layer_005": 0.000137, "vq_loss_layer_006": 0.000271, "vq_loss_layer_007": 0.000311, "vq_loss_layer_008": 0.000303, "vq_loss_layer_009": 0.000473, "vq_loss_layer_010": 0.000404, "vq_loss_layer_011": 0.00034, "vq_loss_layer_012": 0.000671, "vq_loss_layer_013": 0.000454, "vq_loss_layer_014": 0.00069, "vq_loss_layer_015": 0.000526, "vq_loss_layer_016": 0.000481, "vq_loss_layer_017": 0.000511, "vq_loss_layer_018": 0.000298, "vq_loss_layer_019": 0.000243, "vq_loss_layer_020": 0.000315, "vq_loss_layer_021": 0.00058, "vq_loss_layer_022": 0.000359, "vq_loss_layer_023": 0.000463, "vq_loss_layer_024": 0.000343, "vq_loss_layer_025": 0.000671, "vq_loss_layer_026": 0.000973, "vq_loss_layer_027": 0.000797, "vq_loss_layer_028": 0.001274, "vq_loss_layer_029": 0.001457, "vq_loss_layer_030": 0.002502, "vq_loss_layer_031": 0.007751 }, { "ce_loss": 2.321909, "epoch": 0.00329, "grad_norm": 0.004500821232795715, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.056918, "kv_vq_loss": 0.000652, "learning_rate": 0.0008792989744874936, "loss": 0.057596, "step": 3290, "value_mse_loss_layer_000": 0.000801, "value_mse_loss_layer_001": 0.002274, "value_mse_loss_layer_002": 0.008972, "value_mse_loss_layer_003": 0.014221, "value_mse_loss_layer_004": 0.013733, "value_mse_loss_layer_005": 0.013489, "value_mse_loss_layer_006": 0.015137, "value_mse_loss_layer_007": 0.016968, "value_mse_loss_layer_008": 0.020508, "value_mse_loss_layer_009": 0.026367, "value_mse_loss_layer_010": 0.022705, "value_mse_loss_layer_011": 0.024902, "value_mse_loss_layer_012": 0.025879, "value_mse_loss_layer_013": 0.0271, "value_mse_loss_layer_014": 0.028442, "value_mse_loss_layer_015": 0.029419, "value_mse_loss_layer_016": 0.025269, "value_mse_loss_layer_017": 0.028442, "value_mse_loss_layer_018": 0.028564, "value_mse_loss_layer_019": 0.030884, "value_mse_loss_layer_020": 0.034668, "value_mse_loss_layer_021": 0.038574, "value_mse_loss_layer_022": 0.038086, "value_mse_loss_layer_023": 0.043457, "value_mse_loss_layer_024": 0.046875, "value_mse_loss_layer_025": 0.059326, "value_mse_loss_layer_026": 0.051758, "value_mse_loss_layer_027": 0.062988, "value_mse_loss_layer_028": 0.067871, "value_mse_loss_layer_029": 0.09082, "value_mse_loss_layer_030": 0.09668, "value_mse_loss_layer_031": 0.104492, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 9.1e-05, "vq_loss_layer_005": 0.000113, "vq_loss_layer_006": 0.000157, "vq_loss_layer_007": 0.000242, "vq_loss_layer_008": 0.000298, "vq_loss_layer_009": 0.000353, "vq_loss_layer_010": 0.000303, "vq_loss_layer_011": 0.000366, "vq_loss_layer_012": 0.00058, "vq_loss_layer_013": 0.000443, "vq_loss_layer_014": 0.000584, "vq_loss_layer_015": 0.000526, "vq_loss_layer_016": 0.00053, "vq_loss_layer_017": 0.000425, "vq_loss_layer_018": 0.000309, "vq_loss_layer_019": 0.000243, "vq_loss_layer_020": 0.000315, "vq_loss_layer_021": 0.000511, "vq_loss_layer_022": 0.000345, "vq_loss_layer_023": 0.000422, "vq_loss_layer_024": 0.000433, "vq_loss_layer_025": 0.000477, "vq_loss_layer_026": 0.000824, "vq_loss_layer_027": 0.000687, "vq_loss_layer_028": 0.001167, "vq_loss_layer_029": 0.001404, "vq_loss_layer_030": 0.002625, "vq_loss_layer_031": 0.006531 }, { "ce_loss": 2.300007, "epoch": 0.0033, "grad_norm": 0.003764449618756771, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.056763, "kv_vq_loss": 0.000678, "learning_rate": 0.0008796284849694718, "loss": 0.057468, "step": 3300, "value_mse_loss_layer_000": 0.000778, "value_mse_loss_layer_001": 0.002274, "value_mse_loss_layer_002": 0.009216, "value_mse_loss_layer_003": 0.013855, "value_mse_loss_layer_004": 0.012695, "value_mse_loss_layer_005": 0.012573, "value_mse_loss_layer_006": 0.016235, "value_mse_loss_layer_007": 0.017334, "value_mse_loss_layer_008": 0.020142, "value_mse_loss_layer_009": 0.026245, "value_mse_loss_layer_010": 0.022583, "value_mse_loss_layer_011": 0.02478, "value_mse_loss_layer_012": 0.024536, "value_mse_loss_layer_013": 0.025269, "value_mse_loss_layer_014": 0.0271, "value_mse_loss_layer_015": 0.029541, "value_mse_loss_layer_016": 0.026367, "value_mse_loss_layer_017": 0.028809, "value_mse_loss_layer_018": 0.027344, "value_mse_loss_layer_019": 0.030151, "value_mse_loss_layer_020": 0.032227, "value_mse_loss_layer_021": 0.038086, "value_mse_loss_layer_022": 0.037842, "value_mse_loss_layer_023": 0.040283, "value_mse_loss_layer_024": 0.044189, "value_mse_loss_layer_025": 0.053711, "value_mse_loss_layer_026": 0.049316, "value_mse_loss_layer_027": 0.061523, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.09082, "value_mse_loss_layer_030": 0.084473, "value_mse_loss_layer_031": 0.100098, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 0.000107, "vq_loss_layer_006": 0.000211, "vq_loss_layer_007": 0.000271, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.00034, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000338, "vq_loss_layer_012": 0.000511, "vq_loss_layer_013": 0.000423, "vq_loss_layer_014": 0.000565, "vq_loss_layer_015": 0.000568, "vq_loss_layer_016": 0.000591, "vq_loss_layer_017": 0.000523, "vq_loss_layer_018": 0.000275, "vq_loss_layer_019": 0.000215, "vq_loss_layer_020": 0.000305, "vq_loss_layer_021": 0.000484, "vq_loss_layer_022": 0.000351, "vq_loss_layer_023": 0.00036, "vq_loss_layer_024": 0.000355, "vq_loss_layer_025": 0.00041, "vq_loss_layer_026": 0.000702, "vq_loss_layer_027": 0.000736, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001434, "vq_loss_layer_030": 0.003082, "vq_loss_layer_031": 0.005707 }, { "ce_loss": 2.332485, "epoch": 0.00331, "grad_norm": 0.004302978515625, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.080566, "kv_mse_loss": 0.056821, "kv_vq_loss": 0.000672, "learning_rate": 0.0008799569984439295, "loss": 0.057504, "step": 3310, "value_mse_loss_layer_000": 0.000793, "value_mse_loss_layer_001": 0.002274, "value_mse_loss_layer_002": 0.008667, "value_mse_loss_layer_003": 0.015381, "value_mse_loss_layer_004": 0.013245, "value_mse_loss_layer_005": 0.012939, "value_mse_loss_layer_006": 0.014954, "value_mse_loss_layer_007": 0.017334, "value_mse_loss_layer_008": 0.021118, "value_mse_loss_layer_009": 0.028076, "value_mse_loss_layer_010": 0.023438, "value_mse_loss_layer_011": 0.023438, "value_mse_loss_layer_012": 0.025024, "value_mse_loss_layer_013": 0.027588, "value_mse_loss_layer_014": 0.026001, "value_mse_loss_layer_015": 0.027954, "value_mse_loss_layer_016": 0.023438, "value_mse_loss_layer_017": 0.028442, "value_mse_loss_layer_018": 0.026123, "value_mse_loss_layer_019": 0.03125, "value_mse_loss_layer_020": 0.03125, "value_mse_loss_layer_021": 0.038818, "value_mse_loss_layer_022": 0.036133, "value_mse_loss_layer_023": 0.044189, "value_mse_loss_layer_024": 0.044922, "value_mse_loss_layer_025": 0.052734, "value_mse_loss_layer_026": 0.043701, "value_mse_loss_layer_027": 0.060303, "value_mse_loss_layer_028": 0.061523, "value_mse_loss_layer_029": 0.091797, "value_mse_loss_layer_030": 0.081543, "value_mse_loss_layer_031": 0.095215, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 9.8e-05, "vq_loss_layer_006": 0.000152, "vq_loss_layer_007": 0.000277, "vq_loss_layer_008": 0.000271, "vq_loss_layer_009": 0.000433, "vq_loss_layer_010": 0.000319, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.00058, "vq_loss_layer_013": 0.000553, "vq_loss_layer_014": 0.000507, "vq_loss_layer_015": 0.000504, "vq_loss_layer_016": 0.000458, "vq_loss_layer_017": 0.00053, "vq_loss_layer_018": 0.000243, "vq_loss_layer_019": 0.00022, "vq_loss_layer_020": 0.000241, "vq_loss_layer_021": 0.000511, "vq_loss_layer_022": 0.00032, "vq_loss_layer_023": 0.000456, "vq_loss_layer_024": 0.000334, "vq_loss_layer_025": 0.000372, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000736, "vq_loss_layer_028": 0.00095, "vq_loss_layer_029": 0.001617, "vq_loss_layer_030": 0.002945, "vq_loss_layer_031": 0.006256 }, { "ce_loss": 2.318024, "epoch": 0.00332, "grad_norm": 0.004725560545921326, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.057504, "kv_vq_loss": 0.00069, "learning_rate": 0.000880284520926009, "loss": 0.058206, "step": 3320, "value_mse_loss_layer_000": 0.00079, "value_mse_loss_layer_001": 0.002319, "value_mse_loss_layer_002": 0.008789, "value_mse_loss_layer_003": 0.014404, "value_mse_loss_layer_004": 0.012634, "value_mse_loss_layer_005": 0.01355, "value_mse_loss_layer_006": 0.01532, "value_mse_loss_layer_007": 0.017212, "value_mse_loss_layer_008": 0.020142, "value_mse_loss_layer_009": 0.026489, "value_mse_loss_layer_010": 0.022095, "value_mse_loss_layer_011": 0.023926, "value_mse_loss_layer_012": 0.025024, "value_mse_loss_layer_013": 0.025635, "value_mse_loss_layer_014": 0.026855, "value_mse_loss_layer_015": 0.030762, "value_mse_loss_layer_016": 0.026489, "value_mse_loss_layer_017": 0.028687, "value_mse_loss_layer_018": 0.026978, "value_mse_loss_layer_019": 0.030884, "value_mse_loss_layer_020": 0.035889, "value_mse_loss_layer_021": 0.038818, "value_mse_loss_layer_022": 0.037842, "value_mse_loss_layer_023": 0.05127, "value_mse_loss_layer_024": 0.045654, "value_mse_loss_layer_025": 0.053955, "value_mse_loss_layer_026": 0.047852, "value_mse_loss_layer_027": 0.0625, "value_mse_loss_layer_028": 0.0625, "value_mse_loss_layer_029": 0.089355, "value_mse_loss_layer_030": 0.100098, "value_mse_loss_layer_031": 0.097656, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 0.000113, "vq_loss_layer_006": 0.000168, "vq_loss_layer_007": 0.000252, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000366, "vq_loss_layer_010": 0.000267, "vq_loss_layer_011": 0.000336, "vq_loss_layer_012": 0.000515, "vq_loss_layer_013": 0.000425, "vq_loss_layer_014": 0.000492, "vq_loss_layer_015": 0.000633, "vq_loss_layer_016": 0.000526, "vq_loss_layer_017": 0.000435, "vq_loss_layer_018": 0.000275, "vq_loss_layer_019": 0.000197, "vq_loss_layer_020": 0.000401, "vq_loss_layer_021": 0.000469, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.000526, "vq_loss_layer_024": 0.00032, "vq_loss_layer_025": 0.000338, "vq_loss_layer_026": 0.000561, "vq_loss_layer_027": 0.000622, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.001312, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.005585 }, { "ce_loss": 2.29791, "epoch": 0.00333, "grad_norm": 0.003990760538727045, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.083984, "key_mse_loss_layer_024": 0.065918, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.07373, "key_mse_loss_layer_027": 0.072266, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.056863, "kv_vq_loss": 0.000687, "learning_rate": 0.0008806110583765797, "loss": 0.057571, "step": 3330, "value_mse_loss_layer_000": 0.000797, "value_mse_loss_layer_001": 0.002289, "value_mse_loss_layer_002": 0.008972, "value_mse_loss_layer_003": 0.014099, "value_mse_loss_layer_004": 0.012817, "value_mse_loss_layer_005": 0.012817, "value_mse_loss_layer_006": 0.015991, "value_mse_loss_layer_007": 0.016968, "value_mse_loss_layer_008": 0.02063, "value_mse_loss_layer_009": 0.026367, "value_mse_loss_layer_010": 0.022949, "value_mse_loss_layer_011": 0.02417, "value_mse_loss_layer_012": 0.024902, "value_mse_loss_layer_013": 0.026245, "value_mse_loss_layer_014": 0.027222, "value_mse_loss_layer_015": 0.03125, "value_mse_loss_layer_016": 0.026245, "value_mse_loss_layer_017": 0.030151, "value_mse_loss_layer_018": 0.02771, "value_mse_loss_layer_019": 0.03125, "value_mse_loss_layer_020": 0.032715, "value_mse_loss_layer_021": 0.039062, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.042725, "value_mse_loss_layer_024": 0.046143, "value_mse_loss_layer_025": 0.055908, "value_mse_loss_layer_026": 0.046387, "value_mse_loss_layer_027": 0.057129, "value_mse_loss_layer_028": 0.065918, "value_mse_loss_layer_029": 0.088867, "value_mse_loss_layer_030": 0.081055, "value_mse_loss_layer_031": 0.098633, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.000181, "vq_loss_layer_007": 0.000239, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000294, "vq_loss_layer_010": 0.00028, "vq_loss_layer_011": 0.00032, "vq_loss_layer_012": 0.000507, "vq_loss_layer_013": 0.000395, "vq_loss_layer_014": 0.000511, "vq_loss_layer_015": 0.000595, "vq_loss_layer_016": 0.000523, "vq_loss_layer_017": 0.000465, "vq_loss_layer_018": 0.00029, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000561, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.000401, "vq_loss_layer_024": 0.000368, "vq_loss_layer_025": 0.000399, "vq_loss_layer_026": 0.00061, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001266, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.006165 }, { "ce_loss": 2.271337, "epoch": 0.00334, "grad_norm": 0.0038694648537784815, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.062988, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.057257, "kv_vq_loss": 0.000689, "learning_rate": 0.000880936616702891, "loss": 0.057953, "step": 3340, "value_mse_loss_layer_000": 0.000805, "value_mse_loss_layer_001": 0.002258, "value_mse_loss_layer_002": 0.008667, "value_mse_loss_layer_003": 0.015076, "value_mse_loss_layer_004": 0.01355, "value_mse_loss_layer_005": 0.012085, "value_mse_loss_layer_006": 0.014832, "value_mse_loss_layer_007": 0.016724, "value_mse_loss_layer_008": 0.019897, "value_mse_loss_layer_009": 0.026611, "value_mse_loss_layer_010": 0.022827, "value_mse_loss_layer_011": 0.023804, "value_mse_loss_layer_012": 0.023315, "value_mse_loss_layer_013": 0.025269, "value_mse_loss_layer_014": 0.0271, "value_mse_loss_layer_015": 0.030151, "value_mse_loss_layer_016": 0.024902, "value_mse_loss_layer_017": 0.030273, "value_mse_loss_layer_018": 0.026367, "value_mse_loss_layer_019": 0.031128, "value_mse_loss_layer_020": 0.034668, "value_mse_loss_layer_021": 0.040527, "value_mse_loss_layer_022": 0.041016, "value_mse_loss_layer_023": 0.042969, "value_mse_loss_layer_024": 0.04541, "value_mse_loss_layer_025": 0.054688, "value_mse_loss_layer_026": 0.045166, "value_mse_loss_layer_027": 0.062988, "value_mse_loss_layer_028": 0.062988, "value_mse_loss_layer_029": 0.088867, "value_mse_loss_layer_030": 0.083008, "value_mse_loss_layer_031": 0.101074, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 0.000115, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.000159, "vq_loss_layer_007": 0.000263, "vq_loss_layer_008": 0.000214, "vq_loss_layer_009": 0.000349, "vq_loss_layer_010": 0.000265, "vq_loss_layer_011": 0.000317, "vq_loss_layer_012": 0.000467, "vq_loss_layer_013": 0.000391, "vq_loss_layer_014": 0.000479, "vq_loss_layer_015": 0.000546, "vq_loss_layer_016": 0.000456, "vq_loss_layer_017": 0.000488, "vq_loss_layer_018": 0.000241, "vq_loss_layer_019": 0.000195, "vq_loss_layer_020": 0.00025, "vq_loss_layer_021": 0.000504, "vq_loss_layer_022": 0.000353, "vq_loss_layer_023": 0.000362, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000645, "vq_loss_layer_028": 0.000946, "vq_loss_layer_029": 0.001259, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.005981 }, { "ce_loss": 2.291199, "epoch": 0.00335, "grad_norm": 0.003525090403854847, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.057278, "kv_vq_loss": 0.00069, "learning_rate": 0.0008812612017592111, "loss": 0.057999, "step": 3350, "value_mse_loss_layer_000": 0.000793, "value_mse_loss_layer_001": 0.002258, "value_mse_loss_layer_002": 0.008972, "value_mse_loss_layer_003": 0.013916, "value_mse_loss_layer_004": 0.012573, "value_mse_loss_layer_005": 0.013062, "value_mse_loss_layer_006": 0.014709, "value_mse_loss_layer_007": 0.016357, "value_mse_loss_layer_008": 0.021362, "value_mse_loss_layer_009": 0.025269, "value_mse_loss_layer_010": 0.021606, "value_mse_loss_layer_011": 0.023315, "value_mse_loss_layer_012": 0.02417, "value_mse_loss_layer_013": 0.025879, "value_mse_loss_layer_014": 0.027222, "value_mse_loss_layer_015": 0.028931, "value_mse_loss_layer_016": 0.025879, "value_mse_loss_layer_017": 0.028442, "value_mse_loss_layer_018": 0.027954, "value_mse_loss_layer_019": 0.032227, "value_mse_loss_layer_020": 0.033936, "value_mse_loss_layer_021": 0.046631, "value_mse_loss_layer_022": 0.038086, "value_mse_loss_layer_023": 0.042725, "value_mse_loss_layer_024": 0.044678, "value_mse_loss_layer_025": 0.054932, "value_mse_loss_layer_026": 0.054199, "value_mse_loss_layer_027": 0.062988, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.09375, "value_mse_loss_layer_030": 0.083496, "value_mse_loss_layer_031": 0.098145, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.00015, "vq_loss_layer_007": 0.000238, "vq_loss_layer_008": 0.000317, "vq_loss_layer_009": 0.000307, "vq_loss_layer_010": 0.000243, "vq_loss_layer_011": 0.000317, "vq_loss_layer_012": 0.000546, "vq_loss_layer_013": 0.000444, "vq_loss_layer_014": 0.000553, "vq_loss_layer_015": 0.000515, "vq_loss_layer_016": 0.000504, "vq_loss_layer_017": 0.000412, "vq_loss_layer_018": 0.000269, "vq_loss_layer_019": 0.00023, "vq_loss_layer_020": 0.000239, "vq_loss_layer_021": 0.00061, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000324, "vq_loss_layer_024": 0.000301, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000767, "vq_loss_layer_027": 0.00066, "vq_loss_layer_028": 0.000851, "vq_loss_layer_029": 0.001427, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.005737 }, { "ce_loss": 2.269764, "epoch": 0.00336, "grad_norm": 0.004219789523631334, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.057574, "kv_vq_loss": 0.000683, "learning_rate": 0.0008815848193474608, "loss": 0.058282, "step": 3360, "value_mse_loss_layer_000": 0.000816, "value_mse_loss_layer_001": 0.002319, "value_mse_loss_layer_002": 0.008667, "value_mse_loss_layer_003": 0.014282, "value_mse_loss_layer_004": 0.01239, "value_mse_loss_layer_005": 0.012939, "value_mse_loss_layer_006": 0.014893, "value_mse_loss_layer_007": 0.018677, "value_mse_loss_layer_008": 0.020386, "value_mse_loss_layer_009": 0.025635, "value_mse_loss_layer_010": 0.022217, "value_mse_loss_layer_011": 0.022949, "value_mse_loss_layer_012": 0.023315, "value_mse_loss_layer_013": 0.025146, "value_mse_loss_layer_014": 0.026978, "value_mse_loss_layer_015": 0.028564, "value_mse_loss_layer_016": 0.023926, "value_mse_loss_layer_017": 0.029297, "value_mse_loss_layer_018": 0.027954, "value_mse_loss_layer_019": 0.030884, "value_mse_loss_layer_020": 0.035156, "value_mse_loss_layer_021": 0.04248, "value_mse_loss_layer_022": 0.037598, "value_mse_loss_layer_023": 0.047852, "value_mse_loss_layer_024": 0.044678, "value_mse_loss_layer_025": 0.056396, "value_mse_loss_layer_026": 0.047607, "value_mse_loss_layer_027": 0.064941, "value_mse_loss_layer_028": 0.064453, "value_mse_loss_layer_029": 0.095215, "value_mse_loss_layer_030": 0.086914, "value_mse_loss_layer_031": 0.100098, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000153, "vq_loss_layer_007": 0.000324, "vq_loss_layer_008": 0.000259, "vq_loss_layer_009": 0.000313, "vq_loss_layer_010": 0.000259, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.000483, "vq_loss_layer_013": 0.000504, "vq_loss_layer_014": 0.000557, "vq_loss_layer_015": 0.000488, "vq_loss_layer_016": 0.000477, "vq_loss_layer_017": 0.000542, "vq_loss_layer_018": 0.000278, "vq_loss_layer_019": 0.000224, "vq_loss_layer_020": 0.000307, "vq_loss_layer_021": 0.000549, "vq_loss_layer_022": 0.000303, "vq_loss_layer_023": 0.00042, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000359, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000694, "vq_loss_layer_028": 0.000874, "vq_loss_layer_029": 0.00145, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.005524 }, { "ce_loss": 2.291191, "epoch": 0.00337, "grad_norm": 0.003542119637131691, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.074707, "key_mse_loss_layer_030": 0.071777, "key_mse_loss_layer_031": 0.055908, "kv_mse_loss": 0.05726, "kv_vq_loss": 0.000684, "learning_rate": 0.0008819074752178344, "loss": 0.057953, "step": 3370, "value_mse_loss_layer_000": 0.000809, "value_mse_loss_layer_001": 0.002319, "value_mse_loss_layer_002": 0.010864, "value_mse_loss_layer_003": 0.015503, "value_mse_loss_layer_004": 0.014221, "value_mse_loss_layer_005": 0.01355, "value_mse_loss_layer_006": 0.016113, "value_mse_loss_layer_007": 0.0177, "value_mse_loss_layer_008": 0.021484, "value_mse_loss_layer_009": 0.027954, "value_mse_loss_layer_010": 0.024658, "value_mse_loss_layer_011": 0.025391, "value_mse_loss_layer_012": 0.026978, "value_mse_loss_layer_013": 0.028931, "value_mse_loss_layer_014": 0.029785, "value_mse_loss_layer_015": 0.032959, "value_mse_loss_layer_016": 0.026978, "value_mse_loss_layer_017": 0.031494, "value_mse_loss_layer_018": 0.028198, "value_mse_loss_layer_019": 0.032959, "value_mse_loss_layer_020": 0.0354, "value_mse_loss_layer_021": 0.041016, "value_mse_loss_layer_022": 0.037109, "value_mse_loss_layer_023": 0.043213, "value_mse_loss_layer_024": 0.047363, "value_mse_loss_layer_025": 0.061768, "value_mse_loss_layer_026": 0.047607, "value_mse_loss_layer_027": 0.062012, "value_mse_loss_layer_028": 0.062988, "value_mse_loss_layer_029": 0.089844, "value_mse_loss_layer_030": 0.085449, "value_mse_loss_layer_031": 0.109863, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 2.4e-05, "vq_loss_layer_002": 3.3e-05, "vq_loss_layer_003": 5.7e-05, "vq_loss_layer_004": 0.000108, "vq_loss_layer_005": 0.000102, "vq_loss_layer_006": 0.000196, "vq_loss_layer_007": 0.000252, "vq_loss_layer_008": 0.000313, "vq_loss_layer_009": 0.000372, "vq_loss_layer_010": 0.000368, "vq_loss_layer_011": 0.000391, "vq_loss_layer_012": 0.000671, "vq_loss_layer_013": 0.000534, "vq_loss_layer_014": 0.000698, "vq_loss_layer_015": 0.000771, "vq_loss_layer_016": 0.000637, "vq_loss_layer_017": 0.000553, "vq_loss_layer_018": 0.000332, "vq_loss_layer_019": 0.000298, "vq_loss_layer_020": 0.000307, "vq_loss_layer_021": 0.000637, "vq_loss_layer_022": 0.00034, "vq_loss_layer_023": 0.000467, "vq_loss_layer_024": 0.000444, "vq_loss_layer_025": 0.000595, "vq_loss_layer_026": 0.000675, "vq_loss_layer_027": 0.000763, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.00193, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.009155 }, { "ce_loss": 2.307435, "epoch": 0.00338, "grad_norm": 0.004324727226048708, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083496, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.068359, "key_mse_loss_layer_013": 0.107422, "key_mse_loss_layer_014": 0.104004, "key_mse_loss_layer_015": 0.093262, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.057074, "kv_vq_loss": 0.000681, "learning_rate": 0.0008822291750694136, "loss": 0.057764, "step": 3380, "value_mse_loss_layer_000": 0.000805, "value_mse_loss_layer_001": 0.002258, "value_mse_loss_layer_002": 0.009155, "value_mse_loss_layer_003": 0.014282, "value_mse_loss_layer_004": 0.012634, "value_mse_loss_layer_005": 0.01239, "value_mse_loss_layer_006": 0.015137, "value_mse_loss_layer_007": 0.016357, "value_mse_loss_layer_008": 0.02124, "value_mse_loss_layer_009": 0.025391, "value_mse_loss_layer_010": 0.021606, "value_mse_loss_layer_011": 0.022949, "value_mse_loss_layer_012": 0.023438, "value_mse_loss_layer_013": 0.026611, "value_mse_loss_layer_014": 0.026978, "value_mse_loss_layer_015": 0.029907, "value_mse_loss_layer_016": 0.025269, "value_mse_loss_layer_017": 0.029053, "value_mse_loss_layer_018": 0.026489, "value_mse_loss_layer_019": 0.032227, "value_mse_loss_layer_020": 0.032959, "value_mse_loss_layer_021": 0.039551, "value_mse_loss_layer_022": 0.041016, "value_mse_loss_layer_023": 0.043945, "value_mse_loss_layer_024": 0.052246, "value_mse_loss_layer_025": 0.063965, "value_mse_loss_layer_026": 0.052246, "value_mse_loss_layer_027": 0.067383, "value_mse_loss_layer_028": 0.074707, "value_mse_loss_layer_029": 0.094727, "value_mse_loss_layer_030": 0.086914, "value_mse_loss_layer_031": 0.100098, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000153, "vq_loss_layer_007": 0.000225, "vq_loss_layer_008": 0.000263, "vq_loss_layer_009": 0.000271, "vq_loss_layer_010": 0.000237, "vq_loss_layer_011": 0.000278, "vq_loss_layer_012": 0.000477, "vq_loss_layer_013": 0.00045, "vq_loss_layer_014": 0.000477, "vq_loss_layer_015": 0.000584, "vq_loss_layer_016": 0.000486, "vq_loss_layer_017": 0.000423, "vq_loss_layer_018": 0.000236, "vq_loss_layer_019": 0.000216, "vq_loss_layer_020": 0.000227, "vq_loss_layer_021": 0.000418, "vq_loss_layer_022": 0.00036, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000372, "vq_loss_layer_025": 0.000412, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000668, "vq_loss_layer_028": 0.001122, "vq_loss_layer_029": 0.001472, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.005585 }, { "ce_loss": 2.312745, "epoch": 0.00339, "grad_norm": 0.005572822410613298, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.057269, "kv_vq_loss": 0.000665, "learning_rate": 0.0008825499245507705, "loss": 0.057944, "step": 3390, "value_mse_loss_layer_000": 0.000782, "value_mse_loss_layer_001": 0.002274, "value_mse_loss_layer_002": 0.008972, "value_mse_loss_layer_003": 0.014343, "value_mse_loss_layer_004": 0.014954, "value_mse_loss_layer_005": 0.013123, "value_mse_loss_layer_006": 0.015625, "value_mse_loss_layer_007": 0.017456, "value_mse_loss_layer_008": 0.02063, "value_mse_loss_layer_009": 0.026001, "value_mse_loss_layer_010": 0.023926, "value_mse_loss_layer_011": 0.023804, "value_mse_loss_layer_012": 0.024536, "value_mse_loss_layer_013": 0.025879, "value_mse_loss_layer_014": 0.027222, "value_mse_loss_layer_015": 0.029419, "value_mse_loss_layer_016": 0.026001, "value_mse_loss_layer_017": 0.030273, "value_mse_loss_layer_018": 0.031128, "value_mse_loss_layer_019": 0.030762, "value_mse_loss_layer_020": 0.033447, "value_mse_loss_layer_021": 0.039551, "value_mse_loss_layer_022": 0.038818, "value_mse_loss_layer_023": 0.042725, "value_mse_loss_layer_024": 0.047852, "value_mse_loss_layer_025": 0.064941, "value_mse_loss_layer_026": 0.050293, "value_mse_loss_layer_027": 0.064453, "value_mse_loss_layer_028": 0.064941, "value_mse_loss_layer_029": 0.114258, "value_mse_loss_layer_030": 0.089355, "value_mse_loss_layer_031": 0.105469, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 0.00015, "vq_loss_layer_005": 9.1e-05, "vq_loss_layer_006": 0.000172, "vq_loss_layer_007": 0.000277, "vq_loss_layer_008": 0.000265, "vq_loss_layer_009": 0.00032, "vq_loss_layer_010": 0.000317, "vq_loss_layer_011": 0.000336, "vq_loss_layer_012": 0.000523, "vq_loss_layer_013": 0.000422, "vq_loss_layer_014": 0.000507, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.000549, "vq_loss_layer_017": 0.000507, "vq_loss_layer_018": 0.00037, "vq_loss_layer_019": 0.000216, "vq_loss_layer_020": 0.000256, "vq_loss_layer_021": 0.000523, "vq_loss_layer_022": 0.000326, "vq_loss_layer_023": 0.000324, "vq_loss_layer_024": 0.000338, "vq_loss_layer_025": 0.000429, "vq_loss_layer_026": 0.000633, "vq_loss_layer_027": 0.000713, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.001816, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.006256 }, { "ce_loss": 2.305173, "epoch": 0.0034, "grad_norm": 0.005159103311598301, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.09668, "key_mse_loss_layer_031": 0.08252, "kv_mse_loss": 0.057126, "kv_vq_loss": 0.000696, "learning_rate": 0.0008828697292605636, "loss": 0.05784, "step": 3400, "value_mse_loss_layer_000": 0.000759, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.008911, "value_mse_loss_layer_003": 0.01532, "value_mse_loss_layer_004": 0.01355, "value_mse_loss_layer_005": 0.013306, "value_mse_loss_layer_006": 0.015137, "value_mse_loss_layer_007": 0.017212, "value_mse_loss_layer_008": 0.02002, "value_mse_loss_layer_009": 0.025879, "value_mse_loss_layer_010": 0.021851, "value_mse_loss_layer_011": 0.023315, "value_mse_loss_layer_012": 0.025757, "value_mse_loss_layer_013": 0.025146, "value_mse_loss_layer_014": 0.026611, "value_mse_loss_layer_015": 0.031738, "value_mse_loss_layer_016": 0.028687, "value_mse_loss_layer_017": 0.028564, "value_mse_loss_layer_018": 0.0271, "value_mse_loss_layer_019": 0.030273, "value_mse_loss_layer_020": 0.033447, "value_mse_loss_layer_021": 0.040039, "value_mse_loss_layer_022": 0.037842, "value_mse_loss_layer_023": 0.069824, "value_mse_loss_layer_024": 0.044434, "value_mse_loss_layer_025": 0.055908, "value_mse_loss_layer_026": 0.047607, "value_mse_loss_layer_027": 0.061523, "value_mse_loss_layer_028": 0.066895, "value_mse_loss_layer_029": 0.102539, "value_mse_loss_layer_030": 0.087891, "value_mse_loss_layer_031": 0.100098, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 0.0001, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.00016, "vq_loss_layer_007": 0.000267, "vq_loss_layer_008": 0.000243, "vq_loss_layer_009": 0.000328, "vq_loss_layer_010": 0.000275, "vq_loss_layer_011": 0.000309, "vq_loss_layer_012": 0.000618, "vq_loss_layer_013": 0.000381, "vq_loss_layer_014": 0.000538, "vq_loss_layer_015": 0.000679, "vq_loss_layer_016": 0.000675, "vq_loss_layer_017": 0.000423, "vq_loss_layer_018": 0.000273, "vq_loss_layer_019": 0.000206, "vq_loss_layer_020": 0.000284, "vq_loss_layer_021": 0.000565, "vq_loss_layer_022": 0.00032, "vq_loss_layer_023": 0.0009, "vq_loss_layer_024": 0.000383, "vq_loss_layer_025": 0.0005, "vq_loss_layer_026": 0.00071, "vq_loss_layer_027": 0.000893, "vq_loss_layer_028": 0.001549, "vq_loss_layer_029": 0.003235, "vq_loss_layer_030": 0.004211, "vq_loss_layer_031": 0.008972 }, { "ce_loss": 2.322614, "epoch": 0.00341, "grad_norm": 0.0030528628267347813, "key_mse_loss_layer_000": 0.002823, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.05697, "kv_vq_loss": 0.000673, "learning_rate": 0.0008831885947481244, "loss": 0.057657, "step": 3410, "value_mse_loss_layer_000": 0.000767, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.009094, "value_mse_loss_layer_003": 0.013977, "value_mse_loss_layer_004": 0.012878, "value_mse_loss_layer_005": 0.013794, "value_mse_loss_layer_006": 0.015076, "value_mse_loss_layer_007": 0.016846, "value_mse_loss_layer_008": 0.020508, "value_mse_loss_layer_009": 0.026245, "value_mse_loss_layer_010": 0.022461, "value_mse_loss_layer_011": 0.023438, "value_mse_loss_layer_012": 0.025024, "value_mse_loss_layer_013": 0.026367, "value_mse_loss_layer_014": 0.027344, "value_mse_loss_layer_015": 0.029419, "value_mse_loss_layer_016": 0.025391, "value_mse_loss_layer_017": 0.029053, "value_mse_loss_layer_018": 0.025391, "value_mse_loss_layer_019": 0.03064, "value_mse_loss_layer_020": 0.032227, "value_mse_loss_layer_021": 0.037598, "value_mse_loss_layer_022": 0.036621, "value_mse_loss_layer_023": 0.04834, "value_mse_loss_layer_024": 0.043457, "value_mse_loss_layer_025": 0.05249, "value_mse_loss_layer_026": 0.043945, "value_mse_loss_layer_027": 0.057617, "value_mse_loss_layer_028": 0.059814, "value_mse_loss_layer_029": 0.088867, "value_mse_loss_layer_030": 0.079102, "value_mse_loss_layer_031": 0.094238, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 0.000123, "vq_loss_layer_006": 0.000156, "vq_loss_layer_007": 0.000246, "vq_loss_layer_008": 0.000275, "vq_loss_layer_009": 0.000343, "vq_loss_layer_010": 0.000292, "vq_loss_layer_011": 0.000307, "vq_loss_layer_012": 0.000534, "vq_loss_layer_013": 0.000404, "vq_loss_layer_014": 0.000549, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.000507, "vq_loss_layer_017": 0.000408, "vq_loss_layer_018": 0.00025, "vq_loss_layer_019": 0.000214, "vq_loss_layer_020": 0.000267, "vq_loss_layer_021": 0.000546, "vq_loss_layer_022": 0.000341, "vq_loss_layer_023": 0.000553, "vq_loss_layer_024": 0.000347, "vq_loss_layer_025": 0.000423, "vq_loss_layer_026": 0.000542, "vq_loss_layer_027": 0.000648, "vq_loss_layer_028": 0.000889, "vq_loss_layer_029": 0.001389, "vq_loss_layer_030": 0.002563, "vq_loss_layer_031": 0.005707 }, { "ce_loss": 2.277108, "epoch": 0.00342, "grad_norm": 0.004801468923687935, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.057141, "kv_vq_loss": 0.000683, "learning_rate": 0.0008835065265140336, "loss": 0.057816, "step": 3420, "value_mse_loss_layer_000": 0.000767, "value_mse_loss_layer_001": 0.002243, "value_mse_loss_layer_002": 0.008667, "value_mse_loss_layer_003": 0.016846, "value_mse_loss_layer_004": 0.013, "value_mse_loss_layer_005": 0.013123, "value_mse_loss_layer_006": 0.01532, "value_mse_loss_layer_007": 0.017578, "value_mse_loss_layer_008": 0.020142, "value_mse_loss_layer_009": 0.025513, "value_mse_loss_layer_010": 0.026611, "value_mse_loss_layer_011": 0.023438, "value_mse_loss_layer_012": 0.023804, "value_mse_loss_layer_013": 0.025024, "value_mse_loss_layer_014": 0.026245, "value_mse_loss_layer_015": 0.028442, "value_mse_loss_layer_016": 0.02417, "value_mse_loss_layer_017": 0.028198, "value_mse_loss_layer_018": 0.031128, "value_mse_loss_layer_019": 0.030273, "value_mse_loss_layer_020": 0.031982, "value_mse_loss_layer_021": 0.039551, "value_mse_loss_layer_022": 0.037842, "value_mse_loss_layer_023": 0.045898, "value_mse_loss_layer_024": 0.046387, "value_mse_loss_layer_025": 0.066406, "value_mse_loss_layer_026": 0.048584, "value_mse_loss_layer_027": 0.061035, "value_mse_loss_layer_028": 0.072266, "value_mse_loss_layer_029": 0.09375, "value_mse_loss_layer_030": 0.084961, "value_mse_loss_layer_031": 0.100098, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 4.3e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 9.4e-05, "vq_loss_layer_006": 0.000155, "vq_loss_layer_007": 0.000269, "vq_loss_layer_008": 0.000254, "vq_loss_layer_009": 0.000307, "vq_loss_layer_010": 0.000376, "vq_loss_layer_011": 0.000336, "vq_loss_layer_012": 0.000473, "vq_loss_layer_013": 0.000385, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000481, "vq_loss_layer_016": 0.000477, "vq_loss_layer_017": 0.000387, "vq_loss_layer_018": 0.000315, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.000229, "vq_loss_layer_021": 0.000504, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.000397, "vq_loss_layer_024": 0.000328, "vq_loss_layer_025": 0.000504, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.001137, "vq_loss_layer_029": 0.001434, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.006134 }, { "ce_loss": 2.292056, "epoch": 0.00343, "grad_norm": 0.003900132840499282, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.057498, "kv_vq_loss": 0.00072, "learning_rate": 0.0008838235300106925, "loss": 0.058246, "step": 3430, "value_mse_loss_layer_000": 0.000786, "value_mse_loss_layer_001": 0.002274, "value_mse_loss_layer_002": 0.009033, "value_mse_loss_layer_003": 0.015503, "value_mse_loss_layer_004": 0.015137, "value_mse_loss_layer_005": 0.012756, "value_mse_loss_layer_006": 0.015564, "value_mse_loss_layer_007": 0.016357, "value_mse_loss_layer_008": 0.019897, "value_mse_loss_layer_009": 0.026367, "value_mse_loss_layer_010": 0.021729, "value_mse_loss_layer_011": 0.022827, "value_mse_loss_layer_012": 0.023315, "value_mse_loss_layer_013": 0.027588, "value_mse_loss_layer_014": 0.026611, "value_mse_loss_layer_015": 0.029419, "value_mse_loss_layer_016": 0.026733, "value_mse_loss_layer_017": 0.029541, "value_mse_loss_layer_018": 0.028931, "value_mse_loss_layer_019": 0.034424, "value_mse_loss_layer_020": 0.035156, "value_mse_loss_layer_021": 0.040039, "value_mse_loss_layer_022": 0.038574, "value_mse_loss_layer_023": 0.043945, "value_mse_loss_layer_024": 0.045898, "value_mse_loss_layer_025": 0.060791, "value_mse_loss_layer_026": 0.049805, "value_mse_loss_layer_027": 0.066406, "value_mse_loss_layer_028": 0.067383, "value_mse_loss_layer_029": 0.092285, "value_mse_loss_layer_030": 0.09082, "value_mse_loss_layer_031": 0.104004, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 0.000176, "vq_loss_layer_005": 9.8e-05, "vq_loss_layer_006": 0.000181, "vq_loss_layer_007": 0.000223, "vq_loss_layer_008": 0.00023, "vq_loss_layer_009": 0.000376, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.000467, "vq_loss_layer_013": 0.000507, "vq_loss_layer_014": 0.000488, "vq_loss_layer_015": 0.000519, "vq_loss_layer_016": 0.000526, "vq_loss_layer_017": 0.00053, "vq_loss_layer_018": 0.000307, "vq_loss_layer_019": 0.000252, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.000483, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000313, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000383, "vq_loss_layer_026": 0.000534, "vq_loss_layer_027": 0.000702, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.001442, "vq_loss_layer_030": 0.003174, "vq_loss_layer_031": 0.006287 }, { "ce_loss": 2.305309, "epoch": 0.00344, "grad_norm": 0.002968770219013095, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.043457, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.123047, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.056796, "kv_vq_loss": 0.000676, "learning_rate": 0.0008841396106428825, "loss": 0.057474, "step": 3440, "value_mse_loss_layer_000": 0.000767, "value_mse_loss_layer_001": 0.002258, "value_mse_loss_layer_002": 0.008972, "value_mse_loss_layer_003": 0.01416, "value_mse_loss_layer_004": 0.013672, "value_mse_loss_layer_005": 0.013428, "value_mse_loss_layer_006": 0.015503, "value_mse_loss_layer_007": 0.017212, "value_mse_loss_layer_008": 0.020264, "value_mse_loss_layer_009": 0.025879, "value_mse_loss_layer_010": 0.022705, "value_mse_loss_layer_011": 0.023682, "value_mse_loss_layer_012": 0.026978, "value_mse_loss_layer_013": 0.026733, "value_mse_loss_layer_014": 0.026733, "value_mse_loss_layer_015": 0.028809, "value_mse_loss_layer_016": 0.02478, "value_mse_loss_layer_017": 0.029419, "value_mse_loss_layer_018": 0.026611, "value_mse_loss_layer_019": 0.030884, "value_mse_loss_layer_020": 0.032227, "value_mse_loss_layer_021": 0.03833, "value_mse_loss_layer_022": 0.037598, "value_mse_loss_layer_023": 0.040039, "value_mse_loss_layer_024": 0.043945, "value_mse_loss_layer_025": 0.054443, "value_mse_loss_layer_026": 0.043945, "value_mse_loss_layer_027": 0.05835, "value_mse_loss_layer_028": 0.058838, "value_mse_loss_layer_029": 0.080566, "value_mse_loss_layer_030": 0.078125, "value_mse_loss_layer_031": 0.097656, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 8.7e-05, "vq_loss_layer_005": 0.000112, "vq_loss_layer_006": 0.000172, "vq_loss_layer_007": 0.000234, "vq_loss_layer_008": 0.000298, "vq_loss_layer_009": 0.000317, "vq_loss_layer_010": 0.000324, "vq_loss_layer_011": 0.000322, "vq_loss_layer_012": 0.000706, "vq_loss_layer_013": 0.000471, "vq_loss_layer_014": 0.000561, "vq_loss_layer_015": 0.000549, "vq_loss_layer_016": 0.000546, "vq_loss_layer_017": 0.0005, "vq_loss_layer_018": 0.000284, "vq_loss_layer_019": 0.000252, "vq_loss_layer_020": 0.000286, "vq_loss_layer_021": 0.000645, "vq_loss_layer_022": 0.000425, "vq_loss_layer_023": 0.000448, "vq_loss_layer_024": 0.000431, "vq_loss_layer_025": 0.000587, "vq_loss_layer_026": 0.000702, "vq_loss_layer_027": 0.00074, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.001556, "vq_loss_layer_030": 0.002838, "vq_loss_layer_031": 0.007172 }, { "ce_loss": 2.317923, "epoch": 0.00345, "grad_norm": 0.0040840329602360725, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.05701, "kv_vq_loss": 0.000675, "learning_rate": 0.0008844547737683184, "loss": 0.057684, "step": 3450, "value_mse_loss_layer_000": 0.000786, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.009033, "value_mse_loss_layer_003": 0.013672, "value_mse_loss_layer_004": 0.012451, "value_mse_loss_layer_005": 0.012634, "value_mse_loss_layer_006": 0.014648, "value_mse_loss_layer_007": 0.016235, "value_mse_loss_layer_008": 0.02002, "value_mse_loss_layer_009": 0.026001, "value_mse_loss_layer_010": 0.023193, "value_mse_loss_layer_011": 0.025146, "value_mse_loss_layer_012": 0.02356, "value_mse_loss_layer_013": 0.025024, "value_mse_loss_layer_014": 0.026489, "value_mse_loss_layer_015": 0.029419, "value_mse_loss_layer_016": 0.024902, "value_mse_loss_layer_017": 0.029541, "value_mse_loss_layer_018": 0.027222, "value_mse_loss_layer_019": 0.031128, "value_mse_loss_layer_020": 0.031982, "value_mse_loss_layer_021": 0.039307, "value_mse_loss_layer_022": 0.037598, "value_mse_loss_layer_023": 0.042969, "value_mse_loss_layer_024": 0.045898, "value_mse_loss_layer_025": 0.054443, "value_mse_loss_layer_026": 0.047119, "value_mse_loss_layer_027": 0.060059, "value_mse_loss_layer_028": 0.069824, "value_mse_loss_layer_029": 0.099121, "value_mse_loss_layer_030": 0.081055, "value_mse_loss_layer_031": 0.095703, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 9.4e-05, "vq_loss_layer_006": 0.000144, "vq_loss_layer_007": 0.000228, "vq_loss_layer_008": 0.000231, "vq_loss_layer_009": 0.00032, "vq_loss_layer_010": 0.000269, "vq_loss_layer_011": 0.000446, "vq_loss_layer_012": 0.000492, "vq_loss_layer_013": 0.00038, "vq_loss_layer_014": 0.000481, "vq_loss_layer_015": 0.000504, "vq_loss_layer_016": 0.000469, "vq_loss_layer_017": 0.000469, "vq_loss_layer_018": 0.000277, "vq_loss_layer_019": 0.000221, "vq_loss_layer_020": 0.000227, "vq_loss_layer_021": 0.000553, "vq_loss_layer_022": 0.000296, "vq_loss_layer_023": 0.000418, "vq_loss_layer_024": 0.000322, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000618, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.001328, "vq_loss_layer_029": 0.00135, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.005463 }, { "ce_loss": 2.307717, "epoch": 0.00346, "grad_norm": 0.0034400061704218388, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.057129, "kv_vq_loss": 0.000681, "learning_rate": 0.0008847690246981941, "loss": 0.057822, "step": 3460, "value_mse_loss_layer_000": 0.000786, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.008789, "value_mse_loss_layer_003": 0.014587, "value_mse_loss_layer_004": 0.012695, "value_mse_loss_layer_005": 0.014099, "value_mse_loss_layer_006": 0.015503, "value_mse_loss_layer_007": 0.017212, "value_mse_loss_layer_008": 0.02063, "value_mse_loss_layer_009": 0.026489, "value_mse_loss_layer_010": 0.022217, "value_mse_loss_layer_011": 0.023926, "value_mse_loss_layer_012": 0.024536, "value_mse_loss_layer_013": 0.02832, "value_mse_loss_layer_014": 0.027222, "value_mse_loss_layer_015": 0.031128, "value_mse_loss_layer_016": 0.027222, "value_mse_loss_layer_017": 0.031494, "value_mse_loss_layer_018": 0.027222, "value_mse_loss_layer_019": 0.031494, "value_mse_loss_layer_020": 0.044189, "value_mse_loss_layer_021": 0.038574, "value_mse_loss_layer_022": 0.038574, "value_mse_loss_layer_023": 0.043213, "value_mse_loss_layer_024": 0.045654, "value_mse_loss_layer_025": 0.057617, "value_mse_loss_layer_026": 0.047607, "value_mse_loss_layer_027": 0.066895, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.091797, "value_mse_loss_layer_030": 0.084961, "value_mse_loss_layer_031": 0.098145, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 0.000129, "vq_loss_layer_006": 0.000182, "vq_loss_layer_007": 0.000269, "vq_loss_layer_008": 0.000261, "vq_loss_layer_009": 0.000332, "vq_loss_layer_010": 0.000269, "vq_loss_layer_011": 0.000336, "vq_loss_layer_012": 0.000511, "vq_loss_layer_013": 0.000633, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000599, "vq_loss_layer_016": 0.000576, "vq_loss_layer_017": 0.000603, "vq_loss_layer_018": 0.000284, "vq_loss_layer_019": 0.000218, "vq_loss_layer_020": 0.000336, "vq_loss_layer_021": 0.000492, "vq_loss_layer_022": 0.000328, "vq_loss_layer_023": 0.000374, "vq_loss_layer_024": 0.000322, "vq_loss_layer_025": 0.000387, "vq_loss_layer_026": 0.000576, "vq_loss_layer_027": 0.000759, "vq_loss_layer_028": 0.000935, "vq_loss_layer_029": 0.001335, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.005951 }, { "ce_loss": 2.369967, "epoch": 0.00347, "grad_norm": 0.004279418848454952, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.056708, "kv_vq_loss": 0.000664, "learning_rate": 0.0008850823686977183, "loss": 0.057397, "step": 3470, "value_mse_loss_layer_000": 0.000801, "value_mse_loss_layer_001": 0.002274, "value_mse_loss_layer_002": 0.008606, "value_mse_loss_layer_003": 0.013855, "value_mse_loss_layer_004": 0.013611, "value_mse_loss_layer_005": 0.013, "value_mse_loss_layer_006": 0.014954, "value_mse_loss_layer_007": 0.016479, "value_mse_loss_layer_008": 0.02002, "value_mse_loss_layer_009": 0.026001, "value_mse_loss_layer_010": 0.022217, "value_mse_loss_layer_011": 0.023071, "value_mse_loss_layer_012": 0.026489, "value_mse_loss_layer_013": 0.025513, "value_mse_loss_layer_014": 0.030884, "value_mse_loss_layer_015": 0.029907, "value_mse_loss_layer_016": 0.025513, "value_mse_loss_layer_017": 0.029175, "value_mse_loss_layer_018": 0.026489, "value_mse_loss_layer_019": 0.031494, "value_mse_loss_layer_020": 0.03418, "value_mse_loss_layer_021": 0.039307, "value_mse_loss_layer_022": 0.042236, "value_mse_loss_layer_023": 0.04541, "value_mse_loss_layer_024": 0.053223, "value_mse_loss_layer_025": 0.054932, "value_mse_loss_layer_026": 0.052002, "value_mse_loss_layer_027": 0.063477, "value_mse_loss_layer_028": 0.063965, "value_mse_loss_layer_029": 0.088867, "value_mse_loss_layer_030": 0.084961, "value_mse_loss_layer_031": 0.097656, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 0.000102, "vq_loss_layer_005": 0.000102, "vq_loss_layer_006": 0.000164, "vq_loss_layer_007": 0.000239, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000309, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000286, "vq_loss_layer_012": 0.000683, "vq_loss_layer_013": 0.000427, "vq_loss_layer_014": 0.000576, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.0005, "vq_loss_layer_017": 0.000484, "vq_loss_layer_018": 0.000222, "vq_loss_layer_019": 0.000201, "vq_loss_layer_020": 0.000282, "vq_loss_layer_021": 0.000526, "vq_loss_layer_022": 0.000441, "vq_loss_layer_023": 0.000391, "vq_loss_layer_024": 0.000397, "vq_loss_layer_025": 0.000366, "vq_loss_layer_026": 0.000668, "vq_loss_layer_027": 0.000706, "vq_loss_layer_028": 0.000843, "vq_loss_layer_029": 0.001289, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.005737 }, { "ce_loss": 2.322685, "epoch": 0.00348, "grad_norm": 0.005323134828358889, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.057007, "kv_vq_loss": 0.000677, "learning_rate": 0.0008853948109866451, "loss": 0.057712, "step": 3480, "value_mse_loss_layer_000": 0.000771, "value_mse_loss_layer_001": 0.002243, "value_mse_loss_layer_002": 0.009766, "value_mse_loss_layer_003": 0.014648, "value_mse_loss_layer_004": 0.013611, "value_mse_loss_layer_005": 0.013428, "value_mse_loss_layer_006": 0.016235, "value_mse_loss_layer_007": 0.017456, "value_mse_loss_layer_008": 0.02063, "value_mse_loss_layer_009": 0.027344, "value_mse_loss_layer_010": 0.022705, "value_mse_loss_layer_011": 0.024048, "value_mse_loss_layer_012": 0.024902, "value_mse_loss_layer_013": 0.026245, "value_mse_loss_layer_014": 0.027954, "value_mse_loss_layer_015": 0.031006, "value_mse_loss_layer_016": 0.025269, "value_mse_loss_layer_017": 0.030029, "value_mse_loss_layer_018": 0.031738, "value_mse_loss_layer_019": 0.031738, "value_mse_loss_layer_020": 0.032715, "value_mse_loss_layer_021": 0.040283, "value_mse_loss_layer_022": 0.039062, "value_mse_loss_layer_023": 0.042236, "value_mse_loss_layer_024": 0.049072, "value_mse_loss_layer_025": 0.059814, "value_mse_loss_layer_026": 0.047852, "value_mse_loss_layer_027": 0.064941, "value_mse_loss_layer_028": 0.07959, "value_mse_loss_layer_029": 0.108887, "value_mse_loss_layer_030": 0.094727, "value_mse_loss_layer_031": 0.104492, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 9.2e-05, "vq_loss_layer_005": 0.000105, "vq_loss_layer_006": 0.000208, "vq_loss_layer_007": 0.000263, "vq_loss_layer_008": 0.000271, "vq_loss_layer_009": 0.000385, "vq_loss_layer_010": 0.000301, "vq_loss_layer_011": 0.00033, "vq_loss_layer_012": 0.000534, "vq_loss_layer_013": 0.000454, "vq_loss_layer_014": 0.000587, "vq_loss_layer_015": 0.000641, "vq_loss_layer_016": 0.000561, "vq_loss_layer_017": 0.00066, "vq_loss_layer_018": 0.00036, "vq_loss_layer_019": 0.000305, "vq_loss_layer_020": 0.000275, "vq_loss_layer_021": 0.000607, "vq_loss_layer_022": 0.00042, "vq_loss_layer_023": 0.00034, "vq_loss_layer_024": 0.000465, "vq_loss_layer_025": 0.000523, "vq_loss_layer_026": 0.000664, "vq_loss_layer_027": 0.000889, "vq_loss_layer_028": 0.001495, "vq_loss_layer_029": 0.002228, "vq_loss_layer_030": 0.003052, "vq_loss_layer_031": 0.007812 }, { "ce_loss": 2.262829, "epoch": 0.00349, "grad_norm": 0.004698898643255234, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.057257, "kv_vq_loss": 0.000693, "learning_rate": 0.0008857063567397947, "loss": 0.057971, "step": 3490, "value_mse_loss_layer_000": 0.000763, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.008911, "value_mse_loss_layer_003": 0.014526, "value_mse_loss_layer_004": 0.012939, "value_mse_loss_layer_005": 0.013977, "value_mse_loss_layer_006": 0.014954, "value_mse_loss_layer_007": 0.016724, "value_mse_loss_layer_008": 0.020264, "value_mse_loss_layer_009": 0.025269, "value_mse_loss_layer_010": 0.026489, "value_mse_loss_layer_011": 0.023071, "value_mse_loss_layer_012": 0.023804, "value_mse_loss_layer_013": 0.025757, "value_mse_loss_layer_014": 0.027222, "value_mse_loss_layer_015": 0.028687, "value_mse_loss_layer_016": 0.026489, "value_mse_loss_layer_017": 0.02832, "value_mse_loss_layer_018": 0.027222, "value_mse_loss_layer_019": 0.030273, "value_mse_loss_layer_020": 0.031738, "value_mse_loss_layer_021": 0.040039, "value_mse_loss_layer_022": 0.035645, "value_mse_loss_layer_023": 0.049561, "value_mse_loss_layer_024": 0.043213, "value_mse_loss_layer_025": 0.054932, "value_mse_loss_layer_026": 0.046875, "value_mse_loss_layer_027": 0.058838, "value_mse_loss_layer_028": 0.063965, "value_mse_loss_layer_029": 0.089844, "value_mse_loss_layer_030": 0.080078, "value_mse_loss_layer_031": 0.09668, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 0.000127, "vq_loss_layer_006": 0.000158, "vq_loss_layer_007": 0.000241, "vq_loss_layer_008": 0.000301, "vq_loss_layer_009": 0.000311, "vq_loss_layer_010": 0.000397, "vq_loss_layer_011": 0.000301, "vq_loss_layer_012": 0.000523, "vq_loss_layer_013": 0.000437, "vq_loss_layer_014": 0.000557, "vq_loss_layer_015": 0.000538, "vq_loss_layer_016": 0.000603, "vq_loss_layer_017": 0.000441, "vq_loss_layer_018": 0.000299, "vq_loss_layer_019": 0.000225, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.000603, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.000629, "vq_loss_layer_024": 0.000347, "vq_loss_layer_025": 0.000465, "vq_loss_layer_026": 0.000744, "vq_loss_layer_027": 0.00074, "vq_loss_layer_028": 0.001076, "vq_loss_layer_029": 0.001526, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.007019 }, { "ce_loss": 2.289548, "epoch": 0.0035, "grad_norm": 0.005115651059895754, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.056967, "kv_vq_loss": 0.000673, "learning_rate": 0.0008860170110875688, "loss": 0.057657, "step": 3500, "value_mse_loss_layer_000": 0.000782, "value_mse_loss_layer_001": 0.002213, "value_mse_loss_layer_002": 0.008789, "value_mse_loss_layer_003": 0.013977, "value_mse_loss_layer_004": 0.013, "value_mse_loss_layer_005": 0.013184, "value_mse_loss_layer_006": 0.015991, "value_mse_loss_layer_007": 0.016968, "value_mse_loss_layer_008": 0.020386, "value_mse_loss_layer_009": 0.026367, "value_mse_loss_layer_010": 0.022949, "value_mse_loss_layer_011": 0.023804, "value_mse_loss_layer_012": 0.033936, "value_mse_loss_layer_013": 0.026245, "value_mse_loss_layer_014": 0.027344, "value_mse_loss_layer_015": 0.030762, "value_mse_loss_layer_016": 0.026489, "value_mse_loss_layer_017": 0.030884, "value_mse_loss_layer_018": 0.027222, "value_mse_loss_layer_019": 0.032715, "value_mse_loss_layer_020": 0.036621, "value_mse_loss_layer_021": 0.040771, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.044434, "value_mse_loss_layer_024": 0.045166, "value_mse_loss_layer_025": 0.062012, "value_mse_loss_layer_026": 0.045166, "value_mse_loss_layer_027": 0.05957, "value_mse_loss_layer_028": 0.064453, "value_mse_loss_layer_029": 0.088867, "value_mse_loss_layer_030": 0.081543, "value_mse_loss_layer_031": 0.111328, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 8.4e-05, "vq_loss_layer_005": 0.000101, "vq_loss_layer_006": 0.00019, "vq_loss_layer_007": 0.000233, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000317, "vq_loss_layer_010": 0.000294, "vq_loss_layer_011": 0.000322, "vq_loss_layer_012": 0.001091, "vq_loss_layer_013": 0.000422, "vq_loss_layer_014": 0.000515, "vq_loss_layer_015": 0.000607, "vq_loss_layer_016": 0.00053, "vq_loss_layer_017": 0.000553, "vq_loss_layer_018": 0.000284, "vq_loss_layer_019": 0.000241, "vq_loss_layer_020": 0.000299, "vq_loss_layer_021": 0.000561, "vq_loss_layer_022": 0.000322, "vq_loss_layer_023": 0.000416, "vq_loss_layer_024": 0.000326, "vq_loss_layer_025": 0.000427, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.001282, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.006866 }, { "ce_loss": 2.28872, "epoch": 0.00351, "grad_norm": 0.003619731869548559, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.107422, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.089844, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.081543, "kv_mse_loss": 0.056778, "kv_vq_loss": 0.00068, "learning_rate": 0.000886326779116456, "loss": 0.05748, "step": 3510, "value_mse_loss_layer_000": 0.000778, "value_mse_loss_layer_001": 0.002213, "value_mse_loss_layer_002": 0.009521, "value_mse_loss_layer_003": 0.014587, "value_mse_loss_layer_004": 0.012451, "value_mse_loss_layer_005": 0.012329, "value_mse_loss_layer_006": 0.015381, "value_mse_loss_layer_007": 0.016235, "value_mse_loss_layer_008": 0.019897, "value_mse_loss_layer_009": 0.025635, "value_mse_loss_layer_010": 0.021729, "value_mse_loss_layer_011": 0.022827, "value_mse_loss_layer_012": 0.023804, "value_mse_loss_layer_013": 0.025146, "value_mse_loss_layer_014": 0.026855, "value_mse_loss_layer_015": 0.029175, "value_mse_loss_layer_016": 0.024658, "value_mse_loss_layer_017": 0.029175, "value_mse_loss_layer_018": 0.026489, "value_mse_loss_layer_019": 0.031494, "value_mse_loss_layer_020": 0.033447, "value_mse_loss_layer_021": 0.038818, "value_mse_loss_layer_022": 0.039062, "value_mse_loss_layer_023": 0.04248, "value_mse_loss_layer_024": 0.048096, "value_mse_loss_layer_025": 0.054688, "value_mse_loss_layer_026": 0.048096, "value_mse_loss_layer_027": 0.067871, "value_mse_loss_layer_028": 0.066895, "value_mse_loss_layer_029": 0.097656, "value_mse_loss_layer_030": 0.087402, "value_mse_loss_layer_031": 0.09668, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000168, "vq_loss_layer_007": 0.000227, "vq_loss_layer_008": 0.000241, "vq_loss_layer_009": 0.000326, "vq_loss_layer_010": 0.000267, "vq_loss_layer_011": 0.000307, "vq_loss_layer_012": 0.0005, "vq_loss_layer_013": 0.000406, "vq_loss_layer_014": 0.000477, "vq_loss_layer_015": 0.000557, "vq_loss_layer_016": 0.000483, "vq_loss_layer_017": 0.000439, "vq_loss_layer_018": 0.000265, "vq_loss_layer_019": 0.000209, "vq_loss_layer_020": 0.000256, "vq_loss_layer_021": 0.000443, "vq_loss_layer_022": 0.000326, "vq_loss_layer_023": 0.000357, "vq_loss_layer_024": 0.000408, "vq_loss_layer_025": 0.000435, "vq_loss_layer_026": 0.000732, "vq_loss_layer_027": 0.001045, "vq_loss_layer_028": 0.001411, "vq_loss_layer_029": 0.003601, "vq_loss_layer_030": 0.003799, "vq_loss_layer_031": 0.008118 }, { "ce_loss": 2.270943, "epoch": 0.00352, "grad_norm": 0.0037706715520471334, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.056909, "kv_vq_loss": 0.000685, "learning_rate": 0.0008866356658695326, "loss": 0.057602, "step": 3520, "value_mse_loss_layer_000": 0.000763, "value_mse_loss_layer_001": 0.002289, "value_mse_loss_layer_002": 0.009033, "value_mse_loss_layer_003": 0.013794, "value_mse_loss_layer_004": 0.013428, "value_mse_loss_layer_005": 0.013367, "value_mse_loss_layer_006": 0.015381, "value_mse_loss_layer_007": 0.016724, "value_mse_loss_layer_008": 0.020996, "value_mse_loss_layer_009": 0.025757, "value_mse_loss_layer_010": 0.022217, "value_mse_loss_layer_011": 0.023926, "value_mse_loss_layer_012": 0.02478, "value_mse_loss_layer_013": 0.026489, "value_mse_loss_layer_014": 0.0271, "value_mse_loss_layer_015": 0.029419, "value_mse_loss_layer_016": 0.024658, "value_mse_loss_layer_017": 0.028442, "value_mse_loss_layer_018": 0.026489, "value_mse_loss_layer_019": 0.029419, "value_mse_loss_layer_020": 0.032227, "value_mse_loss_layer_021": 0.037598, "value_mse_loss_layer_022": 0.036865, "value_mse_loss_layer_023": 0.039551, "value_mse_loss_layer_024": 0.041748, "value_mse_loss_layer_025": 0.059082, "value_mse_loss_layer_026": 0.045166, "value_mse_loss_layer_027": 0.05957, "value_mse_loss_layer_028": 0.0625, "value_mse_loss_layer_029": 0.087402, "value_mse_loss_layer_030": 0.07959, "value_mse_loss_layer_031": 0.095215, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 9.9e-05, "vq_loss_layer_005": 0.000115, "vq_loss_layer_006": 0.000178, "vq_loss_layer_007": 0.000241, "vq_loss_layer_008": 0.000311, "vq_loss_layer_009": 0.000324, "vq_loss_layer_010": 0.00029, "vq_loss_layer_011": 0.000338, "vq_loss_layer_012": 0.000549, "vq_loss_layer_013": 0.000492, "vq_loss_layer_014": 0.000565, "vq_loss_layer_015": 0.000565, "vq_loss_layer_016": 0.000526, "vq_loss_layer_017": 0.000452, "vq_loss_layer_018": 0.000286, "vq_loss_layer_019": 0.000214, "vq_loss_layer_020": 0.000319, "vq_loss_layer_021": 0.000568, "vq_loss_layer_022": 0.000387, "vq_loss_layer_023": 0.000437, "vq_loss_layer_024": 0.00037, "vq_loss_layer_025": 0.000538, "vq_loss_layer_026": 0.000664, "vq_loss_layer_027": 0.000832, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001427, "vq_loss_layer_030": 0.002701, "vq_loss_layer_031": 0.00589 }, { "ce_loss": 2.343768, "epoch": 0.00353, "grad_norm": 0.004377464298158884, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.056201, "kv_vq_loss": 0.000659, "learning_rate": 0.0008869436763469556, "loss": 0.056866, "step": 3530, "value_mse_loss_layer_000": 0.000767, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.008667, "value_mse_loss_layer_003": 0.014038, "value_mse_loss_layer_004": 0.012512, "value_mse_loss_layer_005": 0.012207, "value_mse_loss_layer_006": 0.014954, "value_mse_loss_layer_007": 0.016602, "value_mse_loss_layer_008": 0.02002, "value_mse_loss_layer_009": 0.026978, "value_mse_loss_layer_010": 0.023193, "value_mse_loss_layer_011": 0.024902, "value_mse_loss_layer_012": 0.023804, "value_mse_loss_layer_013": 0.025269, "value_mse_loss_layer_014": 0.026611, "value_mse_loss_layer_015": 0.030029, "value_mse_loss_layer_016": 0.028442, "value_mse_loss_layer_017": 0.028931, "value_mse_loss_layer_018": 0.034424, "value_mse_loss_layer_019": 0.031738, "value_mse_loss_layer_020": 0.031982, "value_mse_loss_layer_021": 0.038818, "value_mse_loss_layer_022": 0.037598, "value_mse_loss_layer_023": 0.042725, "value_mse_loss_layer_024": 0.043457, "value_mse_loss_layer_025": 0.055176, "value_mse_loss_layer_026": 0.04541, "value_mse_loss_layer_027": 0.060547, "value_mse_loss_layer_028": 0.061035, "value_mse_loss_layer_029": 0.091309, "value_mse_loss_layer_030": 0.081055, "value_mse_loss_layer_031": 0.100586, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 9e-05, "vq_loss_layer_005": 9.1e-05, "vq_loss_layer_006": 0.000157, "vq_loss_layer_007": 0.00025, "vq_loss_layer_008": 0.000239, "vq_loss_layer_009": 0.00037, "vq_loss_layer_010": 0.000263, "vq_loss_layer_011": 0.000416, "vq_loss_layer_012": 0.000483, "vq_loss_layer_013": 0.000395, "vq_loss_layer_014": 0.000479, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.00061, "vq_loss_layer_017": 0.000431, "vq_loss_layer_018": 0.000353, "vq_loss_layer_019": 0.000203, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.000523, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.000397, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.000374, "vq_loss_layer_026": 0.00058, "vq_loss_layer_027": 0.00069, "vq_loss_layer_028": 0.000816, "vq_loss_layer_029": 0.001358, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.006409 }, { "ce_loss": 2.280704, "epoch": 0.00354, "grad_norm": 0.0038142227567732334, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.057062, "kv_vq_loss": 0.000685, "learning_rate": 0.0008872508155064468, "loss": 0.057764, "step": 3540, "value_mse_loss_layer_000": 0.000767, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.008911, "value_mse_loss_layer_003": 0.013611, "value_mse_loss_layer_004": 0.012939, "value_mse_loss_layer_005": 0.014282, "value_mse_loss_layer_006": 0.015503, "value_mse_loss_layer_007": 0.017822, "value_mse_loss_layer_008": 0.02002, "value_mse_loss_layer_009": 0.025879, "value_mse_loss_layer_010": 0.022217, "value_mse_loss_layer_011": 0.022949, "value_mse_loss_layer_012": 0.033203, "value_mse_loss_layer_013": 0.024658, "value_mse_loss_layer_014": 0.026489, "value_mse_loss_layer_015": 0.02832, "value_mse_loss_layer_016": 0.024414, "value_mse_loss_layer_017": 0.029175, "value_mse_loss_layer_018": 0.025879, "value_mse_loss_layer_019": 0.030762, "value_mse_loss_layer_020": 0.037842, "value_mse_loss_layer_021": 0.039551, "value_mse_loss_layer_022": 0.036865, "value_mse_loss_layer_023": 0.04248, "value_mse_loss_layer_024": 0.047607, "value_mse_loss_layer_025": 0.053955, "value_mse_loss_layer_026": 0.045654, "value_mse_loss_layer_027": 0.062256, "value_mse_loss_layer_028": 0.068359, "value_mse_loss_layer_029": 0.088379, "value_mse_loss_layer_030": 0.083008, "value_mse_loss_layer_031": 0.095703, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 8.4e-05, "vq_loss_layer_005": 0.00013, "vq_loss_layer_006": 0.00019, "vq_loss_layer_007": 0.00032, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000326, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000307, "vq_loss_layer_012": 0.001236, "vq_loss_layer_013": 0.000368, "vq_loss_layer_014": 0.00053, "vq_loss_layer_015": 0.000507, "vq_loss_layer_016": 0.000492, "vq_loss_layer_017": 0.000469, "vq_loss_layer_018": 0.000257, "vq_loss_layer_019": 0.000265, "vq_loss_layer_020": 0.00029, "vq_loss_layer_021": 0.000549, "vq_loss_layer_022": 0.000313, "vq_loss_layer_023": 0.000416, "vq_loss_layer_024": 0.000433, "vq_loss_layer_025": 0.000456, "vq_loss_layer_026": 0.000618, "vq_loss_layer_027": 0.000717, "vq_loss_layer_028": 0.001122, "vq_loss_layer_029": 0.00164, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.005615 }, { "ce_loss": 2.269772, "epoch": 0.00355, "grad_norm": 0.0034001620952039957, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.057047, "kv_vq_loss": 0.000662, "learning_rate": 0.0008875570882637733, "loss": 0.057706, "step": 3550, "value_mse_loss_layer_000": 0.000771, "value_mse_loss_layer_001": 0.002213, "value_mse_loss_layer_002": 0.009094, "value_mse_loss_layer_003": 0.014038, "value_mse_loss_layer_004": 0.012573, "value_mse_loss_layer_005": 0.012695, "value_mse_loss_layer_006": 0.015381, "value_mse_loss_layer_007": 0.016724, "value_mse_loss_layer_008": 0.020386, "value_mse_loss_layer_009": 0.025635, "value_mse_loss_layer_010": 0.02356, "value_mse_loss_layer_011": 0.023193, "value_mse_loss_layer_012": 0.02417, "value_mse_loss_layer_013": 0.026855, "value_mse_loss_layer_014": 0.026489, "value_mse_loss_layer_015": 0.029907, "value_mse_loss_layer_016": 0.025146, "value_mse_loss_layer_017": 0.028809, "value_mse_loss_layer_018": 0.026978, "value_mse_loss_layer_019": 0.030762, "value_mse_loss_layer_020": 0.033691, "value_mse_loss_layer_021": 0.039795, "value_mse_loss_layer_022": 0.038086, "value_mse_loss_layer_023": 0.043213, "value_mse_loss_layer_024": 0.04248, "value_mse_loss_layer_025": 0.053955, "value_mse_loss_layer_026": 0.046143, "value_mse_loss_layer_027": 0.061523, "value_mse_loss_layer_028": 0.0625, "value_mse_loss_layer_029": 0.090332, "value_mse_loss_layer_030": 0.082031, "value_mse_loss_layer_031": 0.092285, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 0.000104, "vq_loss_layer_006": 0.000178, "vq_loss_layer_007": 0.000263, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000298, "vq_loss_layer_010": 0.000309, "vq_loss_layer_011": 0.000288, "vq_loss_layer_012": 0.000515, "vq_loss_layer_013": 0.000488, "vq_loss_layer_014": 0.000492, "vq_loss_layer_015": 0.000561, "vq_loss_layer_016": 0.000496, "vq_loss_layer_017": 0.000471, "vq_loss_layer_018": 0.00028, "vq_loss_layer_019": 0.000242, "vq_loss_layer_020": 0.000267, "vq_loss_layer_021": 0.000511, "vq_loss_layer_022": 0.00036, "vq_loss_layer_023": 0.000406, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000401, "vq_loss_layer_026": 0.000568, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.001228, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.005188 }, { "ce_loss": 2.295238, "epoch": 0.00356, "grad_norm": 0.003533037379384041, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.056131, "kv_vq_loss": 0.000669, "learning_rate": 0.0008878624994932186, "loss": 0.056824, "step": 3560, "value_mse_loss_layer_000": 0.000755, "value_mse_loss_layer_001": 0.002197, "value_mse_loss_layer_002": 0.008484, "value_mse_loss_layer_003": 0.013611, "value_mse_loss_layer_004": 0.012695, "value_mse_loss_layer_005": 0.012939, "value_mse_loss_layer_006": 0.015137, "value_mse_loss_layer_007": 0.016602, "value_mse_loss_layer_008": 0.019653, "value_mse_loss_layer_009": 0.025635, "value_mse_loss_layer_010": 0.022217, "value_mse_loss_layer_011": 0.024902, "value_mse_loss_layer_012": 0.024048, "value_mse_loss_layer_013": 0.026123, "value_mse_loss_layer_014": 0.027588, "value_mse_loss_layer_015": 0.028931, "value_mse_loss_layer_016": 0.023926, "value_mse_loss_layer_017": 0.030029, "value_mse_loss_layer_018": 0.024902, "value_mse_loss_layer_019": 0.029053, "value_mse_loss_layer_020": 0.031128, "value_mse_loss_layer_021": 0.037598, "value_mse_loss_layer_022": 0.035889, "value_mse_loss_layer_023": 0.04126, "value_mse_loss_layer_024": 0.052246, "value_mse_loss_layer_025": 0.05127, "value_mse_loss_layer_026": 0.048096, "value_mse_loss_layer_027": 0.05835, "value_mse_loss_layer_028": 0.062988, "value_mse_loss_layer_029": 0.084473, "value_mse_loss_layer_030": 0.079102, "value_mse_loss_layer_031": 0.09668, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 8.8e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000162, "vq_loss_layer_007": 0.000229, "vq_loss_layer_008": 0.000243, "vq_loss_layer_009": 0.000309, "vq_loss_layer_010": 0.000288, "vq_loss_layer_011": 0.000383, "vq_loss_layer_012": 0.000492, "vq_loss_layer_013": 0.00045, "vq_loss_layer_014": 0.000576, "vq_loss_layer_015": 0.000511, "vq_loss_layer_016": 0.000486, "vq_loss_layer_017": 0.000668, "vq_loss_layer_018": 0.000244, "vq_loss_layer_019": 0.000212, "vq_loss_layer_020": 0.000256, "vq_loss_layer_021": 0.000523, "vq_loss_layer_022": 0.000313, "vq_loss_layer_023": 0.000387, "vq_loss_layer_024": 0.000437, "vq_loss_layer_025": 0.000389, "vq_loss_layer_026": 0.000687, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.001549, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.006317 }, { "ce_loss": 2.260482, "epoch": 0.00357, "grad_norm": 0.0052546109072864056, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.059326, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.05715, "kv_vq_loss": 0.000681, "learning_rate": 0.0008881670540280482, "loss": 0.057837, "step": 3570, "value_mse_loss_layer_000": 0.000759, "value_mse_loss_layer_001": 0.002197, "value_mse_loss_layer_002": 0.008423, "value_mse_loss_layer_003": 0.013306, "value_mse_loss_layer_004": 0.011902, "value_mse_loss_layer_005": 0.01239, "value_mse_loss_layer_006": 0.016357, "value_mse_loss_layer_007": 0.016724, "value_mse_loss_layer_008": 0.019287, "value_mse_loss_layer_009": 0.030151, "value_mse_loss_layer_010": 0.022583, "value_mse_loss_layer_011": 0.023804, "value_mse_loss_layer_012": 0.023926, "value_mse_loss_layer_013": 0.025269, "value_mse_loss_layer_014": 0.026611, "value_mse_loss_layer_015": 0.028931, "value_mse_loss_layer_016": 0.023926, "value_mse_loss_layer_017": 0.028809, "value_mse_loss_layer_018": 0.025513, "value_mse_loss_layer_019": 0.032227, "value_mse_loss_layer_020": 0.033691, "value_mse_loss_layer_021": 0.045166, "value_mse_loss_layer_022": 0.037109, "value_mse_loss_layer_023": 0.05127, "value_mse_loss_layer_024": 0.045654, "value_mse_loss_layer_025": 0.056152, "value_mse_loss_layer_026": 0.045898, "value_mse_loss_layer_027": 0.067383, "value_mse_loss_layer_028": 0.063477, "value_mse_loss_layer_029": 0.112305, "value_mse_loss_layer_030": 0.083496, "value_mse_loss_layer_031": 0.096191, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 7e-05, "vq_loss_layer_005": 0.0001, "vq_loss_layer_006": 0.000235, "vq_loss_layer_007": 0.000299, "vq_loss_layer_008": 0.000224, "vq_loss_layer_009": 0.000645, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.000313, "vq_loss_layer_012": 0.000511, "vq_loss_layer_013": 0.000387, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000546, "vq_loss_layer_016": 0.000483, "vq_loss_layer_017": 0.000507, "vq_loss_layer_018": 0.000243, "vq_loss_layer_019": 0.000259, "vq_loss_layer_020": 0.000277, "vq_loss_layer_021": 0.00061, "vq_loss_layer_022": 0.00032, "vq_loss_layer_023": 0.000725, "vq_loss_layer_024": 0.000319, "vq_loss_layer_025": 0.000423, "vq_loss_layer_026": 0.00058, "vq_loss_layer_027": 0.000851, "vq_loss_layer_028": 0.000874, "vq_loss_layer_029": 0.001602, "vq_loss_layer_030": 0.002762, "vq_loss_layer_031": 0.005554 }, { "ce_loss": 2.2993, "epoch": 0.00358, "grad_norm": 0.004092501010745764, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.057053, "kv_vq_loss": 0.000677, "learning_rate": 0.0008884707566609684, "loss": 0.057755, "step": 3580, "value_mse_loss_layer_000": 0.000767, "value_mse_loss_layer_001": 0.002213, "value_mse_loss_layer_002": 0.009094, "value_mse_loss_layer_003": 0.014648, "value_mse_loss_layer_004": 0.014404, "value_mse_loss_layer_005": 0.013306, "value_mse_loss_layer_006": 0.015381, "value_mse_loss_layer_007": 0.01709, "value_mse_loss_layer_008": 0.02002, "value_mse_loss_layer_009": 0.026001, "value_mse_loss_layer_010": 0.022217, "value_mse_loss_layer_011": 0.022583, "value_mse_loss_layer_012": 0.024658, "value_mse_loss_layer_013": 0.025146, "value_mse_loss_layer_014": 0.026489, "value_mse_loss_layer_015": 0.028931, "value_mse_loss_layer_016": 0.024658, "value_mse_loss_layer_017": 0.028931, "value_mse_loss_layer_018": 0.032471, "value_mse_loss_layer_019": 0.03125, "value_mse_loss_layer_020": 0.034424, "value_mse_loss_layer_021": 0.040771, "value_mse_loss_layer_022": 0.040771, "value_mse_loss_layer_023": 0.043213, "value_mse_loss_layer_024": 0.049561, "value_mse_loss_layer_025": 0.058838, "value_mse_loss_layer_026": 0.063965, "value_mse_loss_layer_027": 0.068359, "value_mse_loss_layer_028": 0.069336, "value_mse_loss_layer_029": 0.101074, "value_mse_loss_layer_030": 0.095703, "value_mse_loss_layer_031": 0.11084, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 4.3e-05, "vq_loss_layer_004": 0.0001, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.00016, "vq_loss_layer_007": 0.000234, "vq_loss_layer_008": 0.000269, "vq_loss_layer_009": 0.00034, "vq_loss_layer_010": 0.000303, "vq_loss_layer_011": 0.000294, "vq_loss_layer_012": 0.000511, "vq_loss_layer_013": 0.00038, "vq_loss_layer_014": 0.000488, "vq_loss_layer_015": 0.00061, "vq_loss_layer_016": 0.000492, "vq_loss_layer_017": 0.000456, "vq_loss_layer_018": 0.000511, "vq_loss_layer_019": 0.000252, "vq_loss_layer_020": 0.000248, "vq_loss_layer_021": 0.000549, "vq_loss_layer_022": 0.000378, "vq_loss_layer_023": 0.000351, "vq_loss_layer_024": 0.000374, "vq_loss_layer_025": 0.000454, "vq_loss_layer_026": 0.001236, "vq_loss_layer_027": 0.000687, "vq_loss_layer_028": 0.001167, "vq_loss_layer_029": 0.001839, "vq_loss_layer_030": 0.002975, "vq_loss_layer_031": 0.007507 }, { "ce_loss": 2.268837, "epoch": 0.00359, "grad_norm": 0.004008146468549967, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.056555, "kv_vq_loss": 0.000657, "learning_rate": 0.0008887736121445796, "loss": 0.05722, "step": 3590, "value_mse_loss_layer_000": 0.000759, "value_mse_loss_layer_001": 0.002243, "value_mse_loss_layer_002": 0.009216, "value_mse_loss_layer_003": 0.013977, "value_mse_loss_layer_004": 0.012756, "value_mse_loss_layer_005": 0.012756, "value_mse_loss_layer_006": 0.014954, "value_mse_loss_layer_007": 0.016357, "value_mse_loss_layer_008": 0.019897, "value_mse_loss_layer_009": 0.025513, "value_mse_loss_layer_010": 0.021729, "value_mse_loss_layer_011": 0.023682, "value_mse_loss_layer_012": 0.024536, "value_mse_loss_layer_013": 0.025513, "value_mse_loss_layer_014": 0.028809, "value_mse_loss_layer_015": 0.029907, "value_mse_loss_layer_016": 0.026001, "value_mse_loss_layer_017": 0.031982, "value_mse_loss_layer_018": 0.028076, "value_mse_loss_layer_019": 0.030518, "value_mse_loss_layer_020": 0.032471, "value_mse_loss_layer_021": 0.03833, "value_mse_loss_layer_022": 0.037598, "value_mse_loss_layer_023": 0.042725, "value_mse_loss_layer_024": 0.05127, "value_mse_loss_layer_025": 0.054443, "value_mse_loss_layer_026": 0.047852, "value_mse_loss_layer_027": 0.061035, "value_mse_loss_layer_028": 0.068359, "value_mse_loss_layer_029": 0.097656, "value_mse_loss_layer_030": 0.086914, "value_mse_loss_layer_031": 0.098633, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 9.2e-05, "vq_loss_layer_006": 0.000157, "vq_loss_layer_007": 0.00022, "vq_loss_layer_008": 0.000238, "vq_loss_layer_009": 0.000286, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000334, "vq_loss_layer_012": 0.000519, "vq_loss_layer_013": 0.00042, "vq_loss_layer_014": 0.000511, "vq_loss_layer_015": 0.000549, "vq_loss_layer_016": 0.00058, "vq_loss_layer_017": 0.00061, "vq_loss_layer_018": 0.000301, "vq_loss_layer_019": 0.000237, "vq_loss_layer_020": 0.00025, "vq_loss_layer_021": 0.000511, "vq_loss_layer_022": 0.000305, "vq_loss_layer_023": 0.000395, "vq_loss_layer_024": 0.000404, "vq_loss_layer_025": 0.00041, "vq_loss_layer_026": 0.000618, "vq_loss_layer_027": 0.000679, "vq_loss_layer_028": 0.001099, "vq_loss_layer_029": 0.001503, "vq_loss_layer_030": 0.002716, "vq_loss_layer_031": 0.006348 }, { "ce_loss": 2.281533, "epoch": 0.0036, "grad_norm": 0.0031887448858469725, "key_mse_loss_layer_000": 0.004395, "key_mse_loss_layer_001": 0.012268, "key_mse_loss_layer_002": 0.061279, "key_mse_loss_layer_003": 0.05542, "key_mse_loss_layer_004": 0.05957, "key_mse_loss_layer_005": 0.068848, "key_mse_loss_layer_006": 0.077637, "key_mse_loss_layer_007": 0.085449, "key_mse_loss_layer_008": 0.091797, "key_mse_loss_layer_009": 0.095215, "key_mse_loss_layer_010": 0.109375, "key_mse_loss_layer_011": 0.107422, "key_mse_loss_layer_012": 0.081055, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.125977, "key_mse_loss_layer_015": 0.114258, "key_mse_loss_layer_016": 0.10791, "key_mse_loss_layer_017": 0.110352, "key_mse_loss_layer_018": 0.116699, "key_mse_loss_layer_019": 0.101074, "key_mse_loss_layer_020": 0.112305, "key_mse_loss_layer_021": 0.105469, "key_mse_loss_layer_022": 0.105469, "key_mse_loss_layer_023": 0.102051, "key_mse_loss_layer_024": 0.085449, "key_mse_loss_layer_025": 0.082031, "key_mse_loss_layer_026": 0.098145, "key_mse_loss_layer_027": 0.094238, "key_mse_loss_layer_028": 0.100098, "key_mse_loss_layer_029": 0.091797, "key_mse_loss_layer_030": 0.098145, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.056622, "kv_vq_loss": 0.000656, "learning_rate": 0.0008890756251918216, "loss": 0.057272, "step": 3600, "value_mse_loss_layer_000": 0.000778, "value_mse_loss_layer_001": 0.002243, "value_mse_loss_layer_002": 0.009033, "value_mse_loss_layer_003": 0.015869, "value_mse_loss_layer_004": 0.014221, "value_mse_loss_layer_005": 0.014709, "value_mse_loss_layer_006": 0.016357, "value_mse_loss_layer_007": 0.018188, "value_mse_loss_layer_008": 0.021118, "value_mse_loss_layer_009": 0.026611, "value_mse_loss_layer_010": 0.02356, "value_mse_loss_layer_011": 0.024902, "value_mse_loss_layer_012": 0.025757, "value_mse_loss_layer_013": 0.029175, "value_mse_loss_layer_014": 0.028442, "value_mse_loss_layer_015": 0.029175, "value_mse_loss_layer_016": 0.026001, "value_mse_loss_layer_017": 0.028809, "value_mse_loss_layer_018": 0.026245, "value_mse_loss_layer_019": 0.031982, "value_mse_loss_layer_020": 0.032471, "value_mse_loss_layer_021": 0.040039, "value_mse_loss_layer_022": 0.037109, "value_mse_loss_layer_023": 0.040039, "value_mse_loss_layer_024": 0.047607, "value_mse_loss_layer_025": 0.057129, "value_mse_loss_layer_026": 0.049072, "value_mse_loss_layer_027": 0.071289, "value_mse_loss_layer_028": 0.063477, "value_mse_loss_layer_029": 0.092773, "value_mse_loss_layer_030": 0.086914, "value_mse_loss_layer_031": 0.102051, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 5.1e-05, "vq_loss_layer_004": 0.000132, "vq_loss_layer_005": 0.000177, "vq_loss_layer_006": 0.000241, "vq_loss_layer_007": 0.000332, "vq_loss_layer_008": 0.000317, "vq_loss_layer_009": 0.000349, "vq_loss_layer_010": 0.00034, "vq_loss_layer_011": 0.000374, "vq_loss_layer_012": 0.000595, "vq_loss_layer_013": 0.000641, "vq_loss_layer_014": 0.000626, "vq_loss_layer_015": 0.000629, "vq_loss_layer_016": 0.000633, "vq_loss_layer_017": 0.000515, "vq_loss_layer_018": 0.000515, "vq_loss_layer_019": 0.000427, "vq_loss_layer_020": 0.000385, "vq_loss_layer_021": 0.000645, "vq_loss_layer_022": 0.000435, "vq_loss_layer_023": 0.00041, "vq_loss_layer_024": 0.000538, "vq_loss_layer_025": 0.000607, "vq_loss_layer_026": 0.000908, "vq_loss_layer_027": 0.001236, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.00174, "vq_loss_layer_030": 0.002975, "vq_loss_layer_031": 0.00705 }, { "ce_loss": 2.308818, "epoch": 0.00361, "grad_norm": 0.004841076675802469, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.075684, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.0565, "kv_vq_loss": 0.000656, "learning_rate": 0.0008893768004764144, "loss": 0.057166, "step": 3610, "value_mse_loss_layer_000": 0.000763, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.008545, "value_mse_loss_layer_003": 0.014038, "value_mse_loss_layer_004": 0.01239, "value_mse_loss_layer_005": 0.012573, "value_mse_loss_layer_006": 0.015442, "value_mse_loss_layer_007": 0.016357, "value_mse_loss_layer_008": 0.019409, "value_mse_loss_layer_009": 0.028809, "value_mse_loss_layer_010": 0.023071, "value_mse_loss_layer_011": 0.023071, "value_mse_loss_layer_012": 0.023071, "value_mse_loss_layer_013": 0.025757, "value_mse_loss_layer_014": 0.026978, "value_mse_loss_layer_015": 0.029297, "value_mse_loss_layer_016": 0.024902, "value_mse_loss_layer_017": 0.029053, "value_mse_loss_layer_018": 0.025513, "value_mse_loss_layer_019": 0.030396, "value_mse_loss_layer_020": 0.033936, "value_mse_loss_layer_021": 0.039551, "value_mse_loss_layer_022": 0.037109, "value_mse_loss_layer_023": 0.04126, "value_mse_loss_layer_024": 0.043457, "value_mse_loss_layer_025": 0.055176, "value_mse_loss_layer_026": 0.059814, "value_mse_loss_layer_027": 0.05835, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.087402, "value_mse_loss_layer_030": 0.07959, "value_mse_loss_layer_031": 0.096191, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 0.000101, "vq_loss_layer_006": 0.000204, "vq_loss_layer_007": 0.000234, "vq_loss_layer_008": 0.000243, "vq_loss_layer_009": 0.000486, "vq_loss_layer_010": 0.000299, "vq_loss_layer_011": 0.000313, "vq_loss_layer_012": 0.000475, "vq_loss_layer_013": 0.000446, "vq_loss_layer_014": 0.000511, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.000504, "vq_loss_layer_017": 0.000452, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.000219, "vq_loss_layer_020": 0.000305, "vq_loss_layer_021": 0.000713, "vq_loss_layer_022": 0.000374, "vq_loss_layer_023": 0.000437, "vq_loss_layer_024": 0.000376, "vq_loss_layer_025": 0.000452, "vq_loss_layer_026": 0.001236, "vq_loss_layer_027": 0.000694, "vq_loss_layer_028": 0.00119, "vq_loss_layer_029": 0.001266, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.006195 }, { "ce_loss": 2.297009, "epoch": 0.00362, "grad_norm": 0.003929090220481157, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.057028, "kv_vq_loss": 0.000699, "learning_rate": 0.0008896771426332913, "loss": 0.057733, "step": 3620, "value_mse_loss_layer_000": 0.00079, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.008484, "value_mse_loss_layer_003": 0.014893, "value_mse_loss_layer_004": 0.013916, "value_mse_loss_layer_005": 0.01239, "value_mse_loss_layer_006": 0.014832, "value_mse_loss_layer_007": 0.01709, "value_mse_loss_layer_008": 0.019897, "value_mse_loss_layer_009": 0.025391, "value_mse_loss_layer_010": 0.021851, "value_mse_loss_layer_011": 0.022827, "value_mse_loss_layer_012": 0.032471, "value_mse_loss_layer_013": 0.025146, "value_mse_loss_layer_014": 0.026855, "value_mse_loss_layer_015": 0.030029, "value_mse_loss_layer_016": 0.025024, "value_mse_loss_layer_017": 0.028076, "value_mse_loss_layer_018": 0.0271, "value_mse_loss_layer_019": 0.029541, "value_mse_loss_layer_020": 0.031982, "value_mse_loss_layer_021": 0.038574, "value_mse_loss_layer_022": 0.036377, "value_mse_loss_layer_023": 0.040283, "value_mse_loss_layer_024": 0.042969, "value_mse_loss_layer_025": 0.056885, "value_mse_loss_layer_026": 0.047852, "value_mse_loss_layer_027": 0.059814, "value_mse_loss_layer_028": 0.064941, "value_mse_loss_layer_029": 0.101562, "value_mse_loss_layer_030": 0.081543, "value_mse_loss_layer_031": 0.094238, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 0.000134, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000148, "vq_loss_layer_007": 0.000259, "vq_loss_layer_008": 0.000238, "vq_loss_layer_009": 0.000307, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000299, "vq_loss_layer_012": 0.000946, "vq_loss_layer_013": 0.000408, "vq_loss_layer_014": 0.000538, "vq_loss_layer_015": 0.000549, "vq_loss_layer_016": 0.000492, "vq_loss_layer_017": 0.000406, "vq_loss_layer_018": 0.000271, "vq_loss_layer_019": 0.000228, "vq_loss_layer_020": 0.00025, "vq_loss_layer_021": 0.000483, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000319, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.00038, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000645, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.001625, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.005554 }, { "ce_loss": 2.334144, "epoch": 0.00363, "grad_norm": 0.005204536486417055, "key_mse_loss_layer_000": 0.00383, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.056653, "kv_vq_loss": 0.00067, "learning_rate": 0.0008899766562590281, "loss": 0.057346, "step": 3630, "value_mse_loss_layer_000": 0.000786, "value_mse_loss_layer_001": 0.002258, "value_mse_loss_layer_002": 0.009521, "value_mse_loss_layer_003": 0.014587, "value_mse_loss_layer_004": 0.013672, "value_mse_loss_layer_005": 0.014465, "value_mse_loss_layer_006": 0.01532, "value_mse_loss_layer_007": 0.016602, "value_mse_loss_layer_008": 0.02002, "value_mse_loss_layer_009": 0.025269, "value_mse_loss_layer_010": 0.021606, "value_mse_loss_layer_011": 0.022949, "value_mse_loss_layer_012": 0.02771, "value_mse_loss_layer_013": 0.024414, "value_mse_loss_layer_014": 0.026855, "value_mse_loss_layer_015": 0.028198, "value_mse_loss_layer_016": 0.025391, "value_mse_loss_layer_017": 0.029175, "value_mse_loss_layer_018": 0.026978, "value_mse_loss_layer_019": 0.034912, "value_mse_loss_layer_020": 0.031738, "value_mse_loss_layer_021": 0.039307, "value_mse_loss_layer_022": 0.038574, "value_mse_loss_layer_023": 0.045654, "value_mse_loss_layer_024": 0.050293, "value_mse_loss_layer_025": 0.057373, "value_mse_loss_layer_026": 0.052246, "value_mse_loss_layer_027": 0.095215, "value_mse_loss_layer_028": 0.067871, "value_mse_loss_layer_029": 0.110352, "value_mse_loss_layer_030": 0.09375, "value_mse_loss_layer_031": 0.106445, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 0.000124, "vq_loss_layer_006": 0.000168, "vq_loss_layer_007": 0.00023, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.000299, "vq_loss_layer_010": 0.000282, "vq_loss_layer_011": 0.000338, "vq_loss_layer_012": 0.00069, "vq_loss_layer_013": 0.000399, "vq_loss_layer_014": 0.000534, "vq_loss_layer_015": 0.000542, "vq_loss_layer_016": 0.00061, "vq_loss_layer_017": 0.000622, "vq_loss_layer_018": 0.000294, "vq_loss_layer_019": 0.000282, "vq_loss_layer_020": 0.000284, "vq_loss_layer_021": 0.000595, "vq_loss_layer_022": 0.000402, "vq_loss_layer_023": 0.000414, "vq_loss_layer_024": 0.000462, "vq_loss_layer_025": 0.000465, "vq_loss_layer_026": 0.000702, "vq_loss_layer_027": 0.001335, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.002213, "vq_loss_layer_030": 0.003174, "vq_loss_layer_031": 0.00705 }, { "ce_loss": 2.273297, "epoch": 0.00364, "grad_norm": 0.0038251481018960476, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.056653, "kv_vq_loss": 0.000657, "learning_rate": 0.0008902753459122637, "loss": 0.057303, "step": 3640, "value_mse_loss_layer_000": 0.000774, "value_mse_loss_layer_001": 0.002289, "value_mse_loss_layer_002": 0.008606, "value_mse_loss_layer_003": 0.013672, "value_mse_loss_layer_004": 0.012573, "value_mse_loss_layer_005": 0.012268, "value_mse_loss_layer_006": 0.014709, "value_mse_loss_layer_007": 0.016479, "value_mse_loss_layer_008": 0.019409, "value_mse_loss_layer_009": 0.026001, "value_mse_loss_layer_010": 0.021118, "value_mse_loss_layer_011": 0.022827, "value_mse_loss_layer_012": 0.025146, "value_mse_loss_layer_013": 0.025635, "value_mse_loss_layer_014": 0.027588, "value_mse_loss_layer_015": 0.02771, "value_mse_loss_layer_016": 0.023804, "value_mse_loss_layer_017": 0.028564, "value_mse_loss_layer_018": 0.026978, "value_mse_loss_layer_019": 0.030029, "value_mse_loss_layer_020": 0.037109, "value_mse_loss_layer_021": 0.037109, "value_mse_loss_layer_022": 0.036377, "value_mse_loss_layer_023": 0.042725, "value_mse_loss_layer_024": 0.045898, "value_mse_loss_layer_025": 0.056641, "value_mse_loss_layer_026": 0.046143, "value_mse_loss_layer_027": 0.063965, "value_mse_loss_layer_028": 0.065918, "value_mse_loss_layer_029": 0.086914, "value_mse_loss_layer_030": 0.081543, "value_mse_loss_layer_031": 0.099121, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 8.9e-05, "vq_loss_layer_006": 0.00016, "vq_loss_layer_007": 0.000246, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000362, "vq_loss_layer_010": 0.000246, "vq_loss_layer_011": 0.000299, "vq_loss_layer_012": 0.000542, "vq_loss_layer_013": 0.000454, "vq_loss_layer_014": 0.000603, "vq_loss_layer_015": 0.000488, "vq_loss_layer_016": 0.000488, "vq_loss_layer_017": 0.000572, "vq_loss_layer_018": 0.00025, "vq_loss_layer_019": 0.00022, "vq_loss_layer_020": 0.000284, "vq_loss_layer_021": 0.000507, "vq_loss_layer_022": 0.000298, "vq_loss_layer_023": 0.000408, "vq_loss_layer_024": 0.000353, "vq_loss_layer_025": 0.000439, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.00071, "vq_loss_layer_028": 0.001022, "vq_loss_layer_029": 0.001266, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.006073 }, { "ce_loss": 2.273913, "epoch": 0.00365, "grad_norm": 0.004708140157163143, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.056903, "kv_vq_loss": 0.000664, "learning_rate": 0.0008905732161141185, "loss": 0.057571, "step": 3650, "value_mse_loss_layer_000": 0.000751, "value_mse_loss_layer_001": 0.002182, "value_mse_loss_layer_002": 0.00946, "value_mse_loss_layer_003": 0.013611, "value_mse_loss_layer_004": 0.013245, "value_mse_loss_layer_005": 0.012695, "value_mse_loss_layer_006": 0.015381, "value_mse_loss_layer_007": 0.016602, "value_mse_loss_layer_008": 0.019897, "value_mse_loss_layer_009": 0.026367, "value_mse_loss_layer_010": 0.022705, "value_mse_loss_layer_011": 0.022949, "value_mse_loss_layer_012": 0.024414, "value_mse_loss_layer_013": 0.025269, "value_mse_loss_layer_014": 0.025879, "value_mse_loss_layer_015": 0.028809, "value_mse_loss_layer_016": 0.025757, "value_mse_loss_layer_017": 0.028809, "value_mse_loss_layer_018": 0.026489, "value_mse_loss_layer_019": 0.029541, "value_mse_loss_layer_020": 0.032471, "value_mse_loss_layer_021": 0.037842, "value_mse_loss_layer_022": 0.040527, "value_mse_loss_layer_023": 0.03833, "value_mse_loss_layer_024": 0.041992, "value_mse_loss_layer_025": 0.061035, "value_mse_loss_layer_026": 0.053711, "value_mse_loss_layer_027": 0.062012, "value_mse_loss_layer_028": 0.060791, "value_mse_loss_layer_029": 0.098633, "value_mse_loss_layer_030": 0.07959, "value_mse_loss_layer_031": 0.094238, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 9.4e-05, "vq_loss_layer_005": 0.000104, "vq_loss_layer_006": 0.000193, "vq_loss_layer_007": 0.000254, "vq_loss_layer_008": 0.000246, "vq_loss_layer_009": 0.00036, "vq_loss_layer_010": 0.000271, "vq_loss_layer_011": 0.000311, "vq_loss_layer_012": 0.0005, "vq_loss_layer_013": 0.000393, "vq_loss_layer_014": 0.000473, "vq_loss_layer_015": 0.000526, "vq_loss_layer_016": 0.000507, "vq_loss_layer_017": 0.000462, "vq_loss_layer_018": 0.000284, "vq_loss_layer_019": 0.000224, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.000542, "vq_loss_layer_022": 0.000458, "vq_loss_layer_023": 0.000355, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.000622, "vq_loss_layer_026": 0.000919, "vq_loss_layer_027": 0.000732, "vq_loss_layer_028": 0.000931, "vq_loss_layer_029": 0.001472, "vq_loss_layer_030": 0.002945, "vq_loss_layer_031": 0.006134 }, { "ce_loss": 2.320388, "epoch": 0.00366, "grad_norm": 0.004223922733217478, "key_mse_loss_layer_000": 0.00354, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.056607, "kv_vq_loss": 0.000643, "learning_rate": 0.0008908702713486025, "loss": 0.057236, "step": 3660, "value_mse_loss_layer_000": 0.00079, "value_mse_loss_layer_001": 0.002274, "value_mse_loss_layer_002": 0.009094, "value_mse_loss_layer_003": 0.014648, "value_mse_loss_layer_004": 0.013, "value_mse_loss_layer_005": 0.014282, "value_mse_loss_layer_006": 0.015442, "value_mse_loss_layer_007": 0.016602, "value_mse_loss_layer_008": 0.020386, "value_mse_loss_layer_009": 0.025269, "value_mse_loss_layer_010": 0.021851, "value_mse_loss_layer_011": 0.022583, "value_mse_loss_layer_012": 0.023438, "value_mse_loss_layer_013": 0.024536, "value_mse_loss_layer_014": 0.026001, "value_mse_loss_layer_015": 0.028809, "value_mse_loss_layer_016": 0.025146, "value_mse_loss_layer_017": 0.028809, "value_mse_loss_layer_018": 0.027344, "value_mse_loss_layer_019": 0.031982, "value_mse_loss_layer_020": 0.032471, "value_mse_loss_layer_021": 0.037842, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.044189, "value_mse_loss_layer_024": 0.044678, "value_mse_loss_layer_025": 0.062012, "value_mse_loss_layer_026": 0.049805, "value_mse_loss_layer_027": 0.064453, "value_mse_loss_layer_028": 0.069336, "value_mse_loss_layer_029": 0.097168, "value_mse_loss_layer_030": 0.087402, "value_mse_loss_layer_031": 0.097656, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 0.000134, "vq_loss_layer_006": 0.000184, "vq_loss_layer_007": 0.000243, "vq_loss_layer_008": 0.000273, "vq_loss_layer_009": 0.000296, "vq_loss_layer_010": 0.000271, "vq_loss_layer_011": 0.000288, "vq_loss_layer_012": 0.000462, "vq_loss_layer_013": 0.000387, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.00053, "vq_loss_layer_017": 0.000439, "vq_loss_layer_018": 0.000275, "vq_loss_layer_019": 0.000218, "vq_loss_layer_020": 0.000246, "vq_loss_layer_021": 0.000473, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.000362, "vq_loss_layer_024": 0.000296, "vq_loss_layer_025": 0.000431, "vq_loss_layer_026": 0.000626, "vq_loss_layer_027": 0.00069, "vq_loss_layer_028": 0.001015, "vq_loss_layer_029": 0.001549, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.00589 }, { "ce_loss": 2.337419, "epoch": 0.00367, "grad_norm": 0.003864763770252466, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.056766, "kv_vq_loss": 0.000652, "learning_rate": 0.0008911665160630223, "loss": 0.057419, "step": 3670, "value_mse_loss_layer_000": 0.00074, "value_mse_loss_layer_001": 0.002213, "value_mse_loss_layer_002": 0.00885, "value_mse_loss_layer_003": 0.014038, "value_mse_loss_layer_004": 0.012817, "value_mse_loss_layer_005": 0.013123, "value_mse_loss_layer_006": 0.015015, "value_mse_loss_layer_007": 0.016968, "value_mse_loss_layer_008": 0.019897, "value_mse_loss_layer_009": 0.026489, "value_mse_loss_layer_010": 0.0271, "value_mse_loss_layer_011": 0.023926, "value_mse_loss_layer_012": 0.026978, "value_mse_loss_layer_013": 0.027832, "value_mse_loss_layer_014": 0.028198, "value_mse_loss_layer_015": 0.030029, "value_mse_loss_layer_016": 0.024414, "value_mse_loss_layer_017": 0.027954, "value_mse_loss_layer_018": 0.025391, "value_mse_loss_layer_019": 0.031128, "value_mse_loss_layer_020": 0.031738, "value_mse_loss_layer_021": 0.041748, "value_mse_loss_layer_022": 0.036621, "value_mse_loss_layer_023": 0.041016, "value_mse_loss_layer_024": 0.058838, "value_mse_loss_layer_025": 0.052979, "value_mse_loss_layer_026": 0.044922, "value_mse_loss_layer_027": 0.05835, "value_mse_loss_layer_028": 0.064453, "value_mse_loss_layer_029": 0.088379, "value_mse_loss_layer_030": 0.081055, "value_mse_loss_layer_031": 0.09668, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 9.2e-05, "vq_loss_layer_005": 0.000126, "vq_loss_layer_006": 0.00017, "vq_loss_layer_007": 0.000257, "vq_loss_layer_008": 0.000259, "vq_loss_layer_009": 0.000391, "vq_loss_layer_010": 0.000395, "vq_loss_layer_011": 0.000353, "vq_loss_layer_012": 0.000668, "vq_loss_layer_013": 0.000542, "vq_loss_layer_014": 0.000641, "vq_loss_layer_015": 0.000679, "vq_loss_layer_016": 0.000542, "vq_loss_layer_017": 0.000463, "vq_loss_layer_018": 0.000271, "vq_loss_layer_019": 0.000259, "vq_loss_layer_020": 0.000301, "vq_loss_layer_021": 0.000618, "vq_loss_layer_022": 0.00038, "vq_loss_layer_023": 0.000452, "vq_loss_layer_024": 0.000622, "vq_loss_layer_025": 0.000458, "vq_loss_layer_026": 0.00061, "vq_loss_layer_027": 0.000637, "vq_loss_layer_028": 0.001114, "vq_loss_layer_029": 0.00119, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.006134 }, { "ce_loss": 2.368159, "epoch": 0.00368, "grad_norm": 0.00457999249920249, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.011169, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.071777, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.056757, "kv_vq_loss": 0.000664, "learning_rate": 0.0008914619546683794, "loss": 0.05744, "step": 3680, "value_mse_loss_layer_000": 0.000805, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.009155, "value_mse_loss_layer_003": 0.014343, "value_mse_loss_layer_004": 0.014404, "value_mse_loss_layer_005": 0.013306, "value_mse_loss_layer_006": 0.015503, "value_mse_loss_layer_007": 0.01709, "value_mse_loss_layer_008": 0.020142, "value_mse_loss_layer_009": 0.025269, "value_mse_loss_layer_010": 0.022461, "value_mse_loss_layer_011": 0.023315, "value_mse_loss_layer_012": 0.026367, "value_mse_loss_layer_013": 0.024536, "value_mse_loss_layer_014": 0.026245, "value_mse_loss_layer_015": 0.027954, "value_mse_loss_layer_016": 0.024902, "value_mse_loss_layer_017": 0.02832, "value_mse_loss_layer_018": 0.027832, "value_mse_loss_layer_019": 0.03064, "value_mse_loss_layer_020": 0.033203, "value_mse_loss_layer_021": 0.035889, "value_mse_loss_layer_022": 0.037598, "value_mse_loss_layer_023": 0.041016, "value_mse_loss_layer_024": 0.050049, "value_mse_loss_layer_025": 0.055176, "value_mse_loss_layer_026": 0.05127, "value_mse_loss_layer_027": 0.069336, "value_mse_loss_layer_028": 0.062012, "value_mse_loss_layer_029": 0.09375, "value_mse_loss_layer_030": 0.083984, "value_mse_loss_layer_031": 0.10498, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 0.000133, "vq_loss_layer_005": 0.000123, "vq_loss_layer_006": 0.0002, "vq_loss_layer_007": 0.000269, "vq_loss_layer_008": 0.000288, "vq_loss_layer_009": 0.000338, "vq_loss_layer_010": 0.000317, "vq_loss_layer_011": 0.000353, "vq_loss_layer_012": 0.00066, "vq_loss_layer_013": 0.000404, "vq_loss_layer_014": 0.000542, "vq_loss_layer_015": 0.000557, "vq_loss_layer_016": 0.000557, "vq_loss_layer_017": 0.000633, "vq_loss_layer_018": 0.000401, "vq_loss_layer_019": 0.000273, "vq_loss_layer_020": 0.000343, "vq_loss_layer_021": 0.000431, "vq_loss_layer_022": 0.000364, "vq_loss_layer_023": 0.000362, "vq_loss_layer_024": 0.000492, "vq_loss_layer_025": 0.000526, "vq_loss_layer_026": 0.000874, "vq_loss_layer_027": 0.001053, "vq_loss_layer_028": 0.001015, "vq_loss_layer_029": 0.001846, "vq_loss_layer_030": 0.003159, "vq_loss_layer_031": 0.007568 }, { "ce_loss": 2.326973, "epoch": 0.00369, "grad_norm": 0.0045951069332659245, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.056372, "kv_vq_loss": 0.000683, "learning_rate": 0.0008917565915397648, "loss": 0.057056, "step": 3690, "value_mse_loss_layer_000": 0.000744, "value_mse_loss_layer_001": 0.002182, "value_mse_loss_layer_002": 0.00885, "value_mse_loss_layer_003": 0.013672, "value_mse_loss_layer_004": 0.013184, "value_mse_loss_layer_005": 0.012939, "value_mse_loss_layer_006": 0.015259, "value_mse_loss_layer_007": 0.016235, "value_mse_loss_layer_008": 0.019531, "value_mse_loss_layer_009": 0.024902, "value_mse_loss_layer_010": 0.02124, "value_mse_loss_layer_011": 0.023193, "value_mse_loss_layer_012": 0.023682, "value_mse_loss_layer_013": 0.024292, "value_mse_loss_layer_014": 0.025391, "value_mse_loss_layer_015": 0.028564, "value_mse_loss_layer_016": 0.024902, "value_mse_loss_layer_017": 0.027466, "value_mse_loss_layer_018": 0.025513, "value_mse_loss_layer_019": 0.03125, "value_mse_loss_layer_020": 0.030884, "value_mse_loss_layer_021": 0.036377, "value_mse_loss_layer_022": 0.036377, "value_mse_loss_layer_023": 0.043701, "value_mse_loss_layer_024": 0.044678, "value_mse_loss_layer_025": 0.052979, "value_mse_loss_layer_026": 0.046631, "value_mse_loss_layer_027": 0.060303, "value_mse_loss_layer_028": 0.061523, "value_mse_loss_layer_029": 0.098633, "value_mse_loss_layer_030": 0.081543, "value_mse_loss_layer_031": 0.095215, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 8.5e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000184, "vq_loss_layer_007": 0.000241, "vq_loss_layer_008": 0.000254, "vq_loss_layer_009": 0.000309, "vq_loss_layer_010": 0.000265, "vq_loss_layer_011": 0.000347, "vq_loss_layer_012": 0.000507, "vq_loss_layer_013": 0.000378, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000557, "vq_loss_layer_016": 0.000546, "vq_loss_layer_017": 0.000446, "vq_loss_layer_018": 0.000277, "vq_loss_layer_019": 0.000227, "vq_loss_layer_020": 0.00025, "vq_loss_layer_021": 0.000504, "vq_loss_layer_022": 0.000376, "vq_loss_layer_023": 0.000437, "vq_loss_layer_024": 0.000362, "vq_loss_layer_025": 0.00041, "vq_loss_layer_026": 0.000599, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.000973, "vq_loss_layer_029": 0.001503, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.006256 }, { "ce_loss": 2.301938, "epoch": 0.0037, "grad_norm": 0.00383046199567616, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.056711, "kv_vq_loss": 0.000658, "learning_rate": 0.0008920504310167487, "loss": 0.057364, "step": 3700, "value_mse_loss_layer_000": 0.000748, "value_mse_loss_layer_001": 0.002182, "value_mse_loss_layer_002": 0.008484, "value_mse_loss_layer_003": 0.014099, "value_mse_loss_layer_004": 0.012695, "value_mse_loss_layer_005": 0.013, "value_mse_loss_layer_006": 0.015564, "value_mse_loss_layer_007": 0.016724, "value_mse_loss_layer_008": 0.020752, "value_mse_loss_layer_009": 0.024902, "value_mse_loss_layer_010": 0.022705, "value_mse_loss_layer_011": 0.022339, "value_mse_loss_layer_012": 0.023193, "value_mse_loss_layer_013": 0.024902, "value_mse_loss_layer_014": 0.026123, "value_mse_loss_layer_015": 0.02771, "value_mse_loss_layer_016": 0.024292, "value_mse_loss_layer_017": 0.027588, "value_mse_loss_layer_018": 0.025513, "value_mse_loss_layer_019": 0.030518, "value_mse_loss_layer_020": 0.033203, "value_mse_loss_layer_021": 0.041992, "value_mse_loss_layer_022": 0.037842, "value_mse_loss_layer_023": 0.041016, "value_mse_loss_layer_024": 0.045654, "value_mse_loss_layer_025": 0.054199, "value_mse_loss_layer_026": 0.058594, "value_mse_loss_layer_027": 0.063965, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.094238, "value_mse_loss_layer_030": 0.085449, "value_mse_loss_layer_031": 0.098145, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 0.000104, "vq_loss_layer_006": 0.000209, "vq_loss_layer_007": 0.000241, "vq_loss_layer_008": 0.00028, "vq_loss_layer_009": 0.00028, "vq_loss_layer_010": 0.00029, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.000496, "vq_loss_layer_013": 0.000393, "vq_loss_layer_014": 0.000488, "vq_loss_layer_015": 0.000507, "vq_loss_layer_016": 0.000479, "vq_loss_layer_017": 0.000416, "vq_loss_layer_018": 0.000241, "vq_loss_layer_019": 0.000246, "vq_loss_layer_020": 0.000275, "vq_loss_layer_021": 0.000595, "vq_loss_layer_022": 0.000315, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000301, "vq_loss_layer_025": 0.00037, "vq_loss_layer_026": 0.000992, "vq_loss_layer_027": 0.000622, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.001587, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.006042 }, { "ce_loss": 2.334641, "epoch": 0.00371, "grad_norm": 0.004996480420231819, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.045166, "key_mse_loss_layer_004": 0.043945, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.111816, "key_mse_loss_layer_016": 0.104004, "key_mse_loss_layer_017": 0.106445, "key_mse_loss_layer_018": 0.112305, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.104004, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.100098, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.05668, "kv_vq_loss": 0.000666, "learning_rate": 0.0008923434774037614, "loss": 0.057339, "step": 3710, "value_mse_loss_layer_000": 0.000751, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.008728, "value_mse_loss_layer_003": 0.014282, "value_mse_loss_layer_004": 0.014526, "value_mse_loss_layer_005": 0.012756, "value_mse_loss_layer_006": 0.014648, "value_mse_loss_layer_007": 0.016968, "value_mse_loss_layer_008": 0.019531, "value_mse_loss_layer_009": 0.025024, "value_mse_loss_layer_010": 0.021729, "value_mse_loss_layer_011": 0.023193, "value_mse_loss_layer_012": 0.023315, "value_mse_loss_layer_013": 0.024658, "value_mse_loss_layer_014": 0.026489, "value_mse_loss_layer_015": 0.027466, "value_mse_loss_layer_016": 0.023193, "value_mse_loss_layer_017": 0.027954, "value_mse_loss_layer_018": 0.027222, "value_mse_loss_layer_019": 0.029663, "value_mse_loss_layer_020": 0.031982, "value_mse_loss_layer_021": 0.041748, "value_mse_loss_layer_022": 0.039307, "value_mse_loss_layer_023": 0.043945, "value_mse_loss_layer_024": 0.044678, "value_mse_loss_layer_025": 0.051758, "value_mse_loss_layer_026": 0.048584, "value_mse_loss_layer_027": 0.059326, "value_mse_loss_layer_028": 0.061523, "value_mse_loss_layer_029": 0.09082, "value_mse_loss_layer_030": 0.080566, "value_mse_loss_layer_031": 0.113281, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 0.000145, "vq_loss_layer_005": 0.000102, "vq_loss_layer_006": 0.00016, "vq_loss_layer_007": 0.000257, "vq_loss_layer_008": 0.000292, "vq_loss_layer_009": 0.000313, "vq_loss_layer_010": 0.000334, "vq_loss_layer_011": 0.00038, "vq_loss_layer_012": 0.0005, "vq_loss_layer_013": 0.000404, "vq_loss_layer_014": 0.000595, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.00053, "vq_loss_layer_017": 0.000507, "vq_loss_layer_018": 0.000357, "vq_loss_layer_019": 0.000233, "vq_loss_layer_020": 0.000315, "vq_loss_layer_021": 0.000729, "vq_loss_layer_022": 0.0005, "vq_loss_layer_023": 0.000599, "vq_loss_layer_024": 0.000429, "vq_loss_layer_025": 0.000519, "vq_loss_layer_026": 0.000797, "vq_loss_layer_027": 0.000706, "vq_loss_layer_028": 0.001198, "vq_loss_layer_029": 0.001762, "vq_loss_layer_030": 0.002716, "vq_loss_layer_031": 0.008545 }, { "ce_loss": 2.271089, "epoch": 0.00372, "grad_norm": 0.004999148193746805, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.012817, "key_mse_loss_layer_002": 0.070312, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.068359, "key_mse_loss_layer_006": 0.080078, "key_mse_loss_layer_007": 0.083008, "key_mse_loss_layer_008": 0.094238, "key_mse_loss_layer_009": 0.102051, "key_mse_loss_layer_010": 0.114746, "key_mse_loss_layer_011": 0.112305, "key_mse_loss_layer_012": 0.084961, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.128906, "key_mse_loss_layer_015": 0.118652, "key_mse_loss_layer_016": 0.10498, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.099609, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.091309, "key_mse_loss_layer_027": 0.092773, "key_mse_loss_layer_028": 0.09668, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.057193, "kv_vq_loss": 0.000692, "learning_rate": 0.0008926357349704743, "loss": 0.057895, "step": 3720, "value_mse_loss_layer_000": 0.000839, "value_mse_loss_layer_001": 0.002213, "value_mse_loss_layer_002": 0.008789, "value_mse_loss_layer_003": 0.015076, "value_mse_loss_layer_004": 0.013672, "value_mse_loss_layer_005": 0.013184, "value_mse_loss_layer_006": 0.014771, "value_mse_loss_layer_007": 0.01709, "value_mse_loss_layer_008": 0.020874, "value_mse_loss_layer_009": 0.024536, "value_mse_loss_layer_010": 0.022705, "value_mse_loss_layer_011": 0.023926, "value_mse_loss_layer_012": 0.035156, "value_mse_loss_layer_013": 0.025635, "value_mse_loss_layer_014": 0.027832, "value_mse_loss_layer_015": 0.026733, "value_mse_loss_layer_016": 0.022705, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.023315, "value_mse_loss_layer_019": 0.027588, "value_mse_loss_layer_020": 0.027466, "value_mse_loss_layer_021": 0.037842, "value_mse_loss_layer_022": 0.03418, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.049316, "value_mse_loss_layer_025": 0.048584, "value_mse_loss_layer_026": 0.043457, "value_mse_loss_layer_027": 0.064453, "value_mse_loss_layer_028": 0.057617, "value_mse_loss_layer_029": 0.090332, "value_mse_loss_layer_030": 0.091309, "value_mse_loss_layer_031": 0.101074, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 3.1e-05, "vq_loss_layer_002": 4.8e-05, "vq_loss_layer_003": 5.5e-05, "vq_loss_layer_004": 0.000112, "vq_loss_layer_005": 0.00015, "vq_loss_layer_006": 0.000219, "vq_loss_layer_007": 0.000294, "vq_loss_layer_008": 0.00042, "vq_loss_layer_009": 0.000389, "vq_loss_layer_010": 0.000473, "vq_loss_layer_011": 0.000416, "vq_loss_layer_012": 0.001328, "vq_loss_layer_013": 0.00053, "vq_loss_layer_014": 0.000774, "vq_loss_layer_015": 0.000698, "vq_loss_layer_016": 0.000629, "vq_loss_layer_017": 0.000473, "vq_loss_layer_018": 0.000288, "vq_loss_layer_019": 0.000313, "vq_loss_layer_020": 0.000305, "vq_loss_layer_021": 0.000843, "vq_loss_layer_022": 0.000479, "vq_loss_layer_023": 0.000399, "vq_loss_layer_024": 0.000694, "vq_loss_layer_025": 0.000824, "vq_loss_layer_026": 0.000999, "vq_loss_layer_027": 0.001427, "vq_loss_layer_028": 0.001503, "vq_loss_layer_029": 0.002243, "vq_loss_layer_030": 0.0047, "vq_loss_layer_031": 0.010132 }, { "ce_loss": 2.327118, "epoch": 0.00373, "grad_norm": 0.004134017042815685, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.060059, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.056671, "kv_vq_loss": 0.000652, "learning_rate": 0.0008929272079521718, "loss": 0.057327, "step": 3730, "value_mse_loss_layer_000": 0.000744, "value_mse_loss_layer_001": 0.002182, "value_mse_loss_layer_002": 0.008423, "value_mse_loss_layer_003": 0.01355, "value_mse_loss_layer_004": 0.011963, "value_mse_loss_layer_005": 0.01355, "value_mse_loss_layer_006": 0.014648, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.019287, "value_mse_loss_layer_009": 0.025269, "value_mse_loss_layer_010": 0.022583, "value_mse_loss_layer_011": 0.023438, "value_mse_loss_layer_012": 0.02478, "value_mse_loss_layer_013": 0.02417, "value_mse_loss_layer_014": 0.025635, "value_mse_loss_layer_015": 0.029053, "value_mse_loss_layer_016": 0.02478, "value_mse_loss_layer_017": 0.028809, "value_mse_loss_layer_018": 0.026123, "value_mse_loss_layer_019": 0.030029, "value_mse_loss_layer_020": 0.032959, "value_mse_loss_layer_021": 0.036865, "value_mse_loss_layer_022": 0.036377, "value_mse_loss_layer_023": 0.038574, "value_mse_loss_layer_024": 0.043701, "value_mse_loss_layer_025": 0.055176, "value_mse_loss_layer_026": 0.061768, "value_mse_loss_layer_027": 0.058105, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.085449, "value_mse_loss_layer_030": 0.083008, "value_mse_loss_layer_031": 0.091797, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 0.000123, "vq_loss_layer_006": 0.000163, "vq_loss_layer_007": 0.000237, "vq_loss_layer_008": 0.000219, "vq_loss_layer_009": 0.000319, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000332, "vq_loss_layer_012": 0.000553, "vq_loss_layer_013": 0.00038, "vq_loss_layer_014": 0.000486, "vq_loss_layer_015": 0.000526, "vq_loss_layer_016": 0.0005, "vq_loss_layer_017": 0.000471, "vq_loss_layer_018": 0.000252, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.00024, "vq_loss_layer_021": 0.000429, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.000374, "vq_loss_layer_026": 0.00103, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.0009, "vq_loss_layer_029": 0.001236, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.005127 }, { "ce_loss": 2.304377, "epoch": 0.00374, "grad_norm": 0.005592238623648882, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.056912, "kv_vq_loss": 0.000684, "learning_rate": 0.00089321790055012, "loss": 0.057593, "step": 3740, "value_mse_loss_layer_000": 0.000759, "value_mse_loss_layer_001": 0.002167, "value_mse_loss_layer_002": 0.01001, "value_mse_loss_layer_003": 0.013794, "value_mse_loss_layer_004": 0.013489, "value_mse_loss_layer_005": 0.012939, "value_mse_loss_layer_006": 0.014771, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.020508, "value_mse_loss_layer_009": 0.024536, "value_mse_loss_layer_010": 0.021362, "value_mse_loss_layer_011": 0.021973, "value_mse_loss_layer_012": 0.02356, "value_mse_loss_layer_013": 0.023926, "value_mse_loss_layer_014": 0.02832, "value_mse_loss_layer_015": 0.027466, "value_mse_loss_layer_016": 0.024414, "value_mse_loss_layer_017": 0.027344, "value_mse_loss_layer_018": 0.025879, "value_mse_loss_layer_019": 0.029419, "value_mse_loss_layer_020": 0.030884, "value_mse_loss_layer_021": 0.035156, "value_mse_loss_layer_022": 0.035156, "value_mse_loss_layer_023": 0.04834, "value_mse_loss_layer_024": 0.044678, "value_mse_loss_layer_025": 0.055664, "value_mse_loss_layer_026": 0.049561, "value_mse_loss_layer_027": 0.06543, "value_mse_loss_layer_028": 0.060791, "value_mse_loss_layer_029": 0.098633, "value_mse_loss_layer_030": 0.083008, "value_mse_loss_layer_031": 0.103027, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 2.2e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 0.000108, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000169, "vq_loss_layer_007": 0.00022, "vq_loss_layer_008": 0.000305, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000275, "vq_loss_layer_011": 0.000288, "vq_loss_layer_012": 0.000484, "vq_loss_layer_013": 0.000368, "vq_loss_layer_014": 0.00069, "vq_loss_layer_015": 0.000538, "vq_loss_layer_016": 0.000542, "vq_loss_layer_017": 0.000458, "vq_loss_layer_018": 0.000345, "vq_loss_layer_019": 0.000254, "vq_loss_layer_020": 0.000275, "vq_loss_layer_021": 0.000437, "vq_loss_layer_022": 0.000326, "vq_loss_layer_023": 0.000534, "vq_loss_layer_024": 0.00038, "vq_loss_layer_025": 0.000465, "vq_loss_layer_026": 0.000805, "vq_loss_layer_027": 0.000931, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.001892, "vq_loss_layer_030": 0.002731, "vq_loss_layer_031": 0.007538 }, { "ce_loss": 2.333516, "epoch": 0.00375, "grad_norm": 0.003734483616426587, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.057108, "kv_vq_loss": 0.000659, "learning_rate": 0.0008935078169319296, "loss": 0.057773, "step": 3750, "value_mse_loss_layer_000": 0.000763, "value_mse_loss_layer_001": 0.002197, "value_mse_loss_layer_002": 0.008484, "value_mse_loss_layer_003": 0.014526, "value_mse_loss_layer_004": 0.012268, "value_mse_loss_layer_005": 0.011902, "value_mse_loss_layer_006": 0.015015, "value_mse_loss_layer_007": 0.016357, "value_mse_loss_layer_008": 0.019287, "value_mse_loss_layer_009": 0.024658, "value_mse_loss_layer_010": 0.021851, "value_mse_loss_layer_011": 0.022827, "value_mse_loss_layer_012": 0.023193, "value_mse_loss_layer_013": 0.025146, "value_mse_loss_layer_014": 0.026733, "value_mse_loss_layer_015": 0.028564, "value_mse_loss_layer_016": 0.025146, "value_mse_loss_layer_017": 0.029419, "value_mse_loss_layer_018": 0.026489, "value_mse_loss_layer_019": 0.031494, "value_mse_loss_layer_020": 0.032227, "value_mse_loss_layer_021": 0.040527, "value_mse_loss_layer_022": 0.040283, "value_mse_loss_layer_023": 0.047119, "value_mse_loss_layer_024": 0.043701, "value_mse_loss_layer_025": 0.053223, "value_mse_loss_layer_026": 0.047363, "value_mse_loss_layer_027": 0.060303, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.097656, "value_mse_loss_layer_030": 0.081543, "value_mse_loss_layer_031": 0.091797, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 8.6e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000184, "vq_loss_layer_007": 0.000278, "vq_loss_layer_008": 0.000224, "vq_loss_layer_009": 0.000275, "vq_loss_layer_010": 0.000265, "vq_loss_layer_011": 0.000296, "vq_loss_layer_012": 0.000475, "vq_loss_layer_013": 0.000446, "vq_loss_layer_014": 0.000534, "vq_loss_layer_015": 0.000488, "vq_loss_layer_016": 0.0005, "vq_loss_layer_017": 0.000481, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.000257, "vq_loss_layer_021": 0.000477, "vq_loss_layer_022": 0.000345, "vq_loss_layer_023": 0.000393, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.000927, "vq_loss_layer_029": 0.001373, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.004791 }, { "ce_loss": 2.309132, "epoch": 0.00376, "grad_norm": 0.006839416455477476, "key_mse_loss_layer_000": 0.003937, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.056714, "kv_vq_loss": 0.000656, "learning_rate": 0.0008937969612319152, "loss": 0.05737, "step": 3760, "value_mse_loss_layer_000": 0.000767, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.008545, "value_mse_loss_layer_003": 0.013611, "value_mse_loss_layer_004": 0.012512, "value_mse_loss_layer_005": 0.012817, "value_mse_loss_layer_006": 0.014587, "value_mse_loss_layer_007": 0.017578, "value_mse_loss_layer_008": 0.019897, "value_mse_loss_layer_009": 0.027466, "value_mse_loss_layer_010": 0.021851, "value_mse_loss_layer_011": 0.023315, "value_mse_loss_layer_012": 0.023682, "value_mse_loss_layer_013": 0.025024, "value_mse_loss_layer_014": 0.026611, "value_mse_loss_layer_015": 0.028198, "value_mse_loss_layer_016": 0.023682, "value_mse_loss_layer_017": 0.02832, "value_mse_loss_layer_018": 0.026123, "value_mse_loss_layer_019": 0.029175, "value_mse_loss_layer_020": 0.032227, "value_mse_loss_layer_021": 0.037842, "value_mse_loss_layer_022": 0.035156, "value_mse_loss_layer_023": 0.04126, "value_mse_loss_layer_024": 0.042969, "value_mse_loss_layer_025": 0.07959, "value_mse_loss_layer_026": 0.045166, "value_mse_loss_layer_027": 0.083008, "value_mse_loss_layer_028": 0.061279, "value_mse_loss_layer_029": 0.091309, "value_mse_loss_layer_030": 0.106934, "value_mse_loss_layer_031": 0.097656, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 8.5e-05, "vq_loss_layer_005": 9.8e-05, "vq_loss_layer_006": 0.000159, "vq_loss_layer_007": 0.000322, "vq_loss_layer_008": 0.000254, "vq_loss_layer_009": 0.000431, "vq_loss_layer_010": 0.000275, "vq_loss_layer_011": 0.000317, "vq_loss_layer_012": 0.000479, "vq_loss_layer_013": 0.000463, "vq_loss_layer_014": 0.000492, "vq_loss_layer_015": 0.000492, "vq_loss_layer_016": 0.000444, "vq_loss_layer_017": 0.000511, "vq_loss_layer_018": 0.000267, "vq_loss_layer_019": 0.00022, "vq_loss_layer_020": 0.000278, "vq_loss_layer_021": 0.000507, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.000347, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000706, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.001465, "vq_loss_layer_028": 0.000927, "vq_loss_layer_029": 0.001526, "vq_loss_layer_030": 0.004395, "vq_loss_layer_031": 0.006195 }, { "ce_loss": 2.299289, "epoch": 0.00377, "grad_norm": 0.003804387291893363, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.090332, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.056665, "kv_vq_loss": 0.000676, "learning_rate": 0.000894085337551448, "loss": 0.057343, "step": 3770, "value_mse_loss_layer_000": 0.000748, "value_mse_loss_layer_001": 0.002213, "value_mse_loss_layer_002": 0.009033, "value_mse_loss_layer_003": 0.01416, "value_mse_loss_layer_004": 0.013672, "value_mse_loss_layer_005": 0.013184, "value_mse_loss_layer_006": 0.016235, "value_mse_loss_layer_007": 0.016724, "value_mse_loss_layer_008": 0.020264, "value_mse_loss_layer_009": 0.026245, "value_mse_loss_layer_010": 0.022827, "value_mse_loss_layer_011": 0.023193, "value_mse_loss_layer_012": 0.025513, "value_mse_loss_layer_013": 0.026245, "value_mse_loss_layer_014": 0.026367, "value_mse_loss_layer_015": 0.029419, "value_mse_loss_layer_016": 0.025391, "value_mse_loss_layer_017": 0.029297, "value_mse_loss_layer_018": 0.026123, "value_mse_loss_layer_019": 0.034424, "value_mse_loss_layer_020": 0.033691, "value_mse_loss_layer_021": 0.040283, "value_mse_loss_layer_022": 0.036621, "value_mse_loss_layer_023": 0.043213, "value_mse_loss_layer_024": 0.049805, "value_mse_loss_layer_025": 0.066406, "value_mse_loss_layer_026": 0.052246, "value_mse_loss_layer_027": 0.067383, "value_mse_loss_layer_028": 0.067383, "value_mse_loss_layer_029": 0.094727, "value_mse_loss_layer_030": 0.093262, "value_mse_loss_layer_031": 0.10791, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 0.0001, "vq_loss_layer_005": 0.000102, "vq_loss_layer_006": 0.000213, "vq_loss_layer_007": 0.000233, "vq_loss_layer_008": 0.000273, "vq_loss_layer_009": 0.000341, "vq_loss_layer_010": 0.00032, "vq_loss_layer_011": 0.000332, "vq_loss_layer_012": 0.000595, "vq_loss_layer_013": 0.000463, "vq_loss_layer_014": 0.000477, "vq_loss_layer_015": 0.000618, "vq_loss_layer_016": 0.000526, "vq_loss_layer_017": 0.000437, "vq_loss_layer_018": 0.000277, "vq_loss_layer_019": 0.000257, "vq_loss_layer_020": 0.000271, "vq_loss_layer_021": 0.000526, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000362, "vq_loss_layer_024": 0.000418, "vq_loss_layer_025": 0.000561, "vq_loss_layer_026": 0.000885, "vq_loss_layer_027": 0.000858, "vq_loss_layer_028": 0.001099, "vq_loss_layer_029": 0.001785, "vq_loss_layer_030": 0.003418, "vq_loss_layer_031": 0.007111 }, { "ce_loss": 2.30701, "epoch": 0.00378, "grad_norm": 0.0030123931355774403, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.056766, "kv_vq_loss": 0.000657, "learning_rate": 0.0008943729499593061, "loss": 0.057431, "step": 3780, "value_mse_loss_layer_000": 0.000763, "value_mse_loss_layer_001": 0.002213, "value_mse_loss_layer_002": 0.008667, "value_mse_loss_layer_003": 0.014221, "value_mse_loss_layer_004": 0.01355, "value_mse_loss_layer_005": 0.012939, "value_mse_loss_layer_006": 0.015198, "value_mse_loss_layer_007": 0.01709, "value_mse_loss_layer_008": 0.020264, "value_mse_loss_layer_009": 0.025024, "value_mse_loss_layer_010": 0.021362, "value_mse_loss_layer_011": 0.022461, "value_mse_loss_layer_012": 0.022827, "value_mse_loss_layer_013": 0.024414, "value_mse_loss_layer_014": 0.026367, "value_mse_loss_layer_015": 0.027588, "value_mse_loss_layer_016": 0.024658, "value_mse_loss_layer_017": 0.02771, "value_mse_loss_layer_018": 0.029419, "value_mse_loss_layer_019": 0.03125, "value_mse_loss_layer_020": 0.03125, "value_mse_loss_layer_021": 0.037354, "value_mse_loss_layer_022": 0.039307, "value_mse_loss_layer_023": 0.040771, "value_mse_loss_layer_024": 0.045654, "value_mse_loss_layer_025": 0.055664, "value_mse_loss_layer_026": 0.048828, "value_mse_loss_layer_027": 0.063477, "value_mse_loss_layer_028": 0.066895, "value_mse_loss_layer_029": 0.09082, "value_mse_loss_layer_030": 0.087402, "value_mse_loss_layer_031": 0.098633, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 9e-05, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000169, "vq_loss_layer_007": 0.000252, "vq_loss_layer_008": 0.000284, "vq_loss_layer_009": 0.000296, "vq_loss_layer_010": 0.000284, "vq_loss_layer_011": 0.000299, "vq_loss_layer_012": 0.000452, "vq_loss_layer_013": 0.000443, "vq_loss_layer_014": 0.000526, "vq_loss_layer_015": 0.000463, "vq_loss_layer_016": 0.000492, "vq_loss_layer_017": 0.000467, "vq_loss_layer_018": 0.000345, "vq_loss_layer_019": 0.000257, "vq_loss_layer_020": 0.000244, "vq_loss_layer_021": 0.000444, "vq_loss_layer_022": 0.000372, "vq_loss_layer_023": 0.00037, "vq_loss_layer_024": 0.00036, "vq_loss_layer_025": 0.000446, "vq_loss_layer_026": 0.000717, "vq_loss_layer_027": 0.000687, "vq_loss_layer_028": 0.001091, "vq_loss_layer_029": 0.001816, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.006042 }, { "ce_loss": 2.306453, "epoch": 0.00379, "grad_norm": 0.0029843298252671957, "key_mse_loss_layer_000": 0.008118, "key_mse_loss_layer_001": 0.013977, "key_mse_loss_layer_002": 0.061523, "key_mse_loss_layer_003": 0.054688, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.10498, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.080078, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.091797, "key_mse_loss_layer_028": 0.104492, "key_mse_loss_layer_029": 0.11084, "key_mse_loss_layer_030": 0.104004, "key_mse_loss_layer_031": 0.095703, "kv_mse_loss": 0.056473, "kv_vq_loss": 0.000644, "learning_rate": 0.000894659802492018, "loss": 0.05708, "step": 3790, "value_mse_loss_layer_000": 0.000862, "value_mse_loss_layer_001": 0.002289, "value_mse_loss_layer_002": 0.008972, "value_mse_loss_layer_003": 0.014404, "value_mse_loss_layer_004": 0.01355, "value_mse_loss_layer_005": 0.014038, "value_mse_loss_layer_006": 0.01532, "value_mse_loss_layer_007": 0.016602, "value_mse_loss_layer_008": 0.019531, "value_mse_loss_layer_009": 0.028198, "value_mse_loss_layer_010": 0.021851, "value_mse_loss_layer_011": 0.022461, "value_mse_loss_layer_012": 0.023438, "value_mse_loss_layer_013": 0.02478, "value_mse_loss_layer_014": 0.02771, "value_mse_loss_layer_015": 0.028076, "value_mse_loss_layer_016": 0.02478, "value_mse_loss_layer_017": 0.029175, "value_mse_loss_layer_018": 0.028809, "value_mse_loss_layer_019": 0.033691, "value_mse_loss_layer_020": 0.036133, "value_mse_loss_layer_021": 0.038574, "value_mse_loss_layer_022": 0.039062, "value_mse_loss_layer_023": 0.044922, "value_mse_loss_layer_024": 0.051025, "value_mse_loss_layer_025": 0.062256, "value_mse_loss_layer_026": 0.05542, "value_mse_loss_layer_027": 0.067383, "value_mse_loss_layer_028": 0.066895, "value_mse_loss_layer_029": 0.116211, "value_mse_loss_layer_030": 0.106445, "value_mse_loss_layer_031": 0.112305, "vq_loss_layer_000": 1.3e-05, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 0.0001, "vq_loss_layer_005": 0.00013, "vq_loss_layer_006": 0.000189, "vq_loss_layer_007": 0.000241, "vq_loss_layer_008": 0.000267, "vq_loss_layer_009": 0.000471, "vq_loss_layer_010": 0.00029, "vq_loss_layer_011": 0.000319, "vq_loss_layer_012": 0.000477, "vq_loss_layer_013": 0.000408, "vq_loss_layer_014": 0.000614, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.000565, "vq_loss_layer_017": 0.0005, "vq_loss_layer_018": 0.000322, "vq_loss_layer_019": 0.00033, "vq_loss_layer_020": 0.000341, "vq_loss_layer_021": 0.00058, "vq_loss_layer_022": 0.00038, "vq_loss_layer_023": 0.000423, "vq_loss_layer_024": 0.000454, "vq_loss_layer_025": 0.00066, "vq_loss_layer_026": 0.00103, "vq_loss_layer_027": 0.00082, "vq_loss_layer_028": 0.001205, "vq_loss_layer_029": 0.002518, "vq_loss_layer_030": 0.003296, "vq_loss_layer_031": 0.008667 }, { "ce_loss": 2.286619, "epoch": 0.0038, "grad_norm": 0.004713899921625853, "key_mse_loss_layer_000": 0.002655, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.044922, "key_mse_loss_layer_004": 0.041992, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.128906, "key_mse_loss_layer_014": 0.123047, "key_mse_loss_layer_015": 0.110352, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.103027, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.056186, "kv_vq_loss": 0.000654, "learning_rate": 0.0008949458991542024, "loss": 0.056836, "step": 3800, "value_mse_loss_layer_000": 0.000706, "value_mse_loss_layer_001": 0.002243, "value_mse_loss_layer_002": 0.008484, "value_mse_loss_layer_003": 0.013245, "value_mse_loss_layer_004": 0.013489, "value_mse_loss_layer_005": 0.012939, "value_mse_loss_layer_006": 0.018188, "value_mse_loss_layer_007": 0.0177, "value_mse_loss_layer_008": 0.019653, "value_mse_loss_layer_009": 0.026489, "value_mse_loss_layer_010": 0.022827, "value_mse_loss_layer_011": 0.023071, "value_mse_loss_layer_012": 0.025269, "value_mse_loss_layer_013": 0.026733, "value_mse_loss_layer_014": 0.026489, "value_mse_loss_layer_015": 0.026855, "value_mse_loss_layer_016": 0.022827, "value_mse_loss_layer_017": 0.027588, "value_mse_loss_layer_018": 0.023315, "value_mse_loss_layer_019": 0.028931, "value_mse_loss_layer_020": 0.029785, "value_mse_loss_layer_021": 0.036377, "value_mse_loss_layer_022": 0.030884, "value_mse_loss_layer_023": 0.0354, "value_mse_loss_layer_024": 0.041748, "value_mse_loss_layer_025": 0.055176, "value_mse_loss_layer_026": 0.046631, "value_mse_loss_layer_027": 0.056885, "value_mse_loss_layer_028": 0.05835, "value_mse_loss_layer_029": 0.094238, "value_mse_loss_layer_030": 0.083984, "value_mse_loss_layer_031": 0.099121, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 3e-05, "vq_loss_layer_002": 2.8e-05, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 0.000113, "vq_loss_layer_005": 0.000125, "vq_loss_layer_006": 0.000376, "vq_loss_layer_007": 0.000282, "vq_loss_layer_008": 0.000355, "vq_loss_layer_009": 0.000372, "vq_loss_layer_010": 0.000391, "vq_loss_layer_011": 0.000366, "vq_loss_layer_012": 0.000637, "vq_loss_layer_013": 0.000475, "vq_loss_layer_014": 0.000603, "vq_loss_layer_015": 0.000645, "vq_loss_layer_016": 0.00058, "vq_loss_layer_017": 0.000469, "vq_loss_layer_018": 0.000435, "vq_loss_layer_019": 0.000378, "vq_loss_layer_020": 0.000374, "vq_loss_layer_021": 0.00074, "vq_loss_layer_022": 0.00042, "vq_loss_layer_023": 0.002106, "vq_loss_layer_024": 0.000839, "vq_loss_layer_025": 0.001671, "vq_loss_layer_026": 0.00103, "vq_loss_layer_027": 0.003174, "vq_loss_layer_028": 0.001442, "vq_loss_layer_029": 0.006439, "vq_loss_layer_030": 0.003586, "vq_loss_layer_031": 0.008545 }, { "ce_loss": 2.347169, "epoch": 0.00381, "grad_norm": 0.004120238590985537, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.056189, "kv_vq_loss": 0.000668, "learning_rate": 0.0008952312439189047, "loss": 0.056873, "step": 3810, "value_mse_loss_layer_000": 0.000755, "value_mse_loss_layer_001": 0.002167, "value_mse_loss_layer_002": 0.008545, "value_mse_loss_layer_003": 0.014771, "value_mse_loss_layer_004": 0.012756, "value_mse_loss_layer_005": 0.012329, "value_mse_loss_layer_006": 0.015137, "value_mse_loss_layer_007": 0.015869, "value_mse_loss_layer_008": 0.019287, "value_mse_loss_layer_009": 0.02478, "value_mse_loss_layer_010": 0.021484, "value_mse_loss_layer_011": 0.022217, "value_mse_loss_layer_012": 0.023804, "value_mse_loss_layer_013": 0.025024, "value_mse_loss_layer_014": 0.025879, "value_mse_loss_layer_015": 0.029785, "value_mse_loss_layer_016": 0.026978, "value_mse_loss_layer_017": 0.028809, "value_mse_loss_layer_018": 0.025879, "value_mse_loss_layer_019": 0.030518, "value_mse_loss_layer_020": 0.031494, "value_mse_loss_layer_021": 0.03833, "value_mse_loss_layer_022": 0.036377, "value_mse_loss_layer_023": 0.044189, "value_mse_loss_layer_024": 0.046387, "value_mse_loss_layer_025": 0.064453, "value_mse_loss_layer_026": 0.047607, "value_mse_loss_layer_027": 0.057617, "value_mse_loss_layer_028": 0.062012, "value_mse_loss_layer_029": 0.092285, "value_mse_loss_layer_030": 0.08252, "value_mse_loss_layer_031": 0.095215, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 0.00012, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000175, "vq_loss_layer_007": 0.000221, "vq_loss_layer_008": 0.000217, "vq_loss_layer_009": 0.00029, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.0005, "vq_loss_layer_013": 0.000414, "vq_loss_layer_014": 0.000477, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.00058, "vq_loss_layer_017": 0.000465, "vq_loss_layer_018": 0.000261, "vq_loss_layer_019": 0.000198, "vq_loss_layer_020": 0.000239, "vq_loss_layer_021": 0.000496, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.00042, "vq_loss_layer_024": 0.000326, "vq_loss_layer_025": 0.000423, "vq_loss_layer_026": 0.000626, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.000931, "vq_loss_layer_029": 0.001587, "vq_loss_layer_030": 0.003143, "vq_loss_layer_031": 0.00589 }, { "ce_loss": 2.345073, "epoch": 0.00382, "grad_norm": 0.0032447651028633118, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.056183, "kv_vq_loss": 0.000659, "learning_rate": 0.000895515840727927, "loss": 0.056839, "step": 3820, "value_mse_loss_layer_000": 0.00074, "value_mse_loss_layer_001": 0.002197, "value_mse_loss_layer_002": 0.008728, "value_mse_loss_layer_003": 0.013672, "value_mse_loss_layer_004": 0.012451, "value_mse_loss_layer_005": 0.012329, "value_mse_loss_layer_006": 0.014832, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.019653, "value_mse_loss_layer_009": 0.026489, "value_mse_loss_layer_010": 0.021973, "value_mse_loss_layer_011": 0.022949, "value_mse_loss_layer_012": 0.023193, "value_mse_loss_layer_013": 0.024658, "value_mse_loss_layer_014": 0.026001, "value_mse_loss_layer_015": 0.028687, "value_mse_loss_layer_016": 0.025513, "value_mse_loss_layer_017": 0.028687, "value_mse_loss_layer_018": 0.027222, "value_mse_loss_layer_019": 0.029907, "value_mse_loss_layer_020": 0.031738, "value_mse_loss_layer_021": 0.038086, "value_mse_loss_layer_022": 0.039062, "value_mse_loss_layer_023": 0.041992, "value_mse_loss_layer_024": 0.044434, "value_mse_loss_layer_025": 0.057373, "value_mse_loss_layer_026": 0.046143, "value_mse_loss_layer_027": 0.060059, "value_mse_loss_layer_028": 0.064941, "value_mse_loss_layer_029": 0.087891, "value_mse_loss_layer_030": 0.081543, "value_mse_loss_layer_031": 0.094727, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 8.8e-05, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000162, "vq_loss_layer_007": 0.000222, "vq_loss_layer_008": 0.00025, "vq_loss_layer_009": 0.000366, "vq_loss_layer_010": 0.000319, "vq_loss_layer_011": 0.000299, "vq_loss_layer_012": 0.000465, "vq_loss_layer_013": 0.000372, "vq_loss_layer_014": 0.000479, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.000504, "vq_loss_layer_017": 0.000515, "vq_loss_layer_018": 0.000278, "vq_loss_layer_019": 0.000201, "vq_loss_layer_020": 0.000248, "vq_loss_layer_021": 0.00045, "vq_loss_layer_022": 0.00033, "vq_loss_layer_023": 0.00036, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.00037, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.000938, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.002563, "vq_loss_layer_031": 0.005493 }, { "ce_loss": 2.302621, "epoch": 0.00383, "grad_norm": 0.004911948926746845, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.087891, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.056268, "kv_vq_loss": 0.00065, "learning_rate": 0.0008957996934921555, "loss": 0.056891, "step": 3830, "value_mse_loss_layer_000": 0.000732, "value_mse_loss_layer_001": 0.002182, "value_mse_loss_layer_002": 0.008728, "value_mse_loss_layer_003": 0.014038, "value_mse_loss_layer_004": 0.013184, "value_mse_loss_layer_005": 0.012817, "value_mse_loss_layer_006": 0.014771, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.019409, "value_mse_loss_layer_009": 0.024902, "value_mse_loss_layer_010": 0.020142, "value_mse_loss_layer_011": 0.021851, "value_mse_loss_layer_012": 0.022339, "value_mse_loss_layer_013": 0.025513, "value_mse_loss_layer_014": 0.025024, "value_mse_loss_layer_015": 0.026733, "value_mse_loss_layer_016": 0.023315, "value_mse_loss_layer_017": 0.027222, "value_mse_loss_layer_018": 0.027954, "value_mse_loss_layer_019": 0.031738, "value_mse_loss_layer_020": 0.032715, "value_mse_loss_layer_021": 0.041016, "value_mse_loss_layer_022": 0.037842, "value_mse_loss_layer_023": 0.049805, "value_mse_loss_layer_024": 0.047119, "value_mse_loss_layer_025": 0.066895, "value_mse_loss_layer_026": 0.054443, "value_mse_loss_layer_027": 0.081543, "value_mse_loss_layer_028": 0.070312, "value_mse_loss_layer_029": 0.105957, "value_mse_loss_layer_030": 0.102539, "value_mse_loss_layer_031": 0.105469, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 9.2e-05, "vq_loss_layer_006": 0.000148, "vq_loss_layer_007": 0.000218, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.00033, "vq_loss_layer_010": 0.000244, "vq_loss_layer_011": 0.000322, "vq_loss_layer_012": 0.00045, "vq_loss_layer_013": 0.000452, "vq_loss_layer_014": 0.000454, "vq_loss_layer_015": 0.000483, "vq_loss_layer_016": 0.00046, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000336, "vq_loss_layer_019": 0.000244, "vq_loss_layer_020": 0.00022, "vq_loss_layer_021": 0.000511, "vq_loss_layer_022": 0.000296, "vq_loss_layer_023": 0.000423, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.000565, "vq_loss_layer_026": 0.000744, "vq_loss_layer_027": 0.001099, "vq_loss_layer_028": 0.001091, "vq_loss_layer_029": 0.002136, "vq_loss_layer_030": 0.003189, "vq_loss_layer_031": 0.006836 }, { "ce_loss": 2.332757, "epoch": 0.00384, "grad_norm": 0.00416441960260272, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.056348, "kv_vq_loss": 0.000644, "learning_rate": 0.0008960828060918826, "loss": 0.056985, "step": 3840, "value_mse_loss_layer_000": 0.000755, "value_mse_loss_layer_001": 0.002213, "value_mse_loss_layer_002": 0.008667, "value_mse_loss_layer_003": 0.013367, "value_mse_loss_layer_004": 0.011963, "value_mse_loss_layer_005": 0.01239, "value_mse_loss_layer_006": 0.014771, "value_mse_loss_layer_007": 0.016113, "value_mse_loss_layer_008": 0.019409, "value_mse_loss_layer_009": 0.024536, "value_mse_loss_layer_010": 0.021362, "value_mse_loss_layer_011": 0.022095, "value_mse_loss_layer_012": 0.022339, "value_mse_loss_layer_013": 0.024292, "value_mse_loss_layer_014": 0.025635, "value_mse_loss_layer_015": 0.027954, "value_mse_loss_layer_016": 0.023804, "value_mse_loss_layer_017": 0.028442, "value_mse_loss_layer_018": 0.026611, "value_mse_loss_layer_019": 0.03064, "value_mse_loss_layer_020": 0.031494, "value_mse_loss_layer_021": 0.037842, "value_mse_loss_layer_022": 0.036865, "value_mse_loss_layer_023": 0.039307, "value_mse_loss_layer_024": 0.045166, "value_mse_loss_layer_025": 0.05249, "value_mse_loss_layer_026": 0.06543, "value_mse_loss_layer_027": 0.064453, "value_mse_loss_layer_028": 0.065918, "value_mse_loss_layer_029": 0.088379, "value_mse_loss_layer_030": 0.082031, "value_mse_loss_layer_031": 0.09375, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000161, "vq_loss_layer_007": 0.000256, "vq_loss_layer_008": 0.000237, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000252, "vq_loss_layer_011": 0.000277, "vq_loss_layer_012": 0.000469, "vq_loss_layer_013": 0.000387, "vq_loss_layer_014": 0.000486, "vq_loss_layer_015": 0.000481, "vq_loss_layer_016": 0.000467, "vq_loss_layer_017": 0.000463, "vq_loss_layer_018": 0.000259, "vq_loss_layer_019": 0.000231, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.000504, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.00029, "vq_loss_layer_024": 0.000311, "vq_loss_layer_025": 0.000355, "vq_loss_layer_026": 0.001099, "vq_loss_layer_027": 0.000668, "vq_loss_layer_028": 0.001152, "vq_loss_layer_029": 0.001175, "vq_loss_layer_030": 0.003159, "vq_loss_layer_031": 0.005066 }, { "ce_loss": 2.29191, "epoch": 0.00385, "grad_norm": 0.0041472697630524635, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.056458, "kv_vq_loss": 0.000646, "learning_rate": 0.0008963651823771252, "loss": 0.057089, "step": 3850, "value_mse_loss_layer_000": 0.000763, "value_mse_loss_layer_001": 0.002182, "value_mse_loss_layer_002": 0.008545, "value_mse_loss_layer_003": 0.016113, "value_mse_loss_layer_004": 0.012207, "value_mse_loss_layer_005": 0.012146, "value_mse_loss_layer_006": 0.014954, "value_mse_loss_layer_007": 0.016479, "value_mse_loss_layer_008": 0.019287, "value_mse_loss_layer_009": 0.026001, "value_mse_loss_layer_010": 0.021851, "value_mse_loss_layer_011": 0.022583, "value_mse_loss_layer_012": 0.029785, "value_mse_loss_layer_013": 0.024902, "value_mse_loss_layer_014": 0.026123, "value_mse_loss_layer_015": 0.027832, "value_mse_loss_layer_016": 0.024902, "value_mse_loss_layer_017": 0.028809, "value_mse_loss_layer_018": 0.025391, "value_mse_loss_layer_019": 0.032471, "value_mse_loss_layer_020": 0.034424, "value_mse_loss_layer_021": 0.049805, "value_mse_loss_layer_022": 0.038818, "value_mse_loss_layer_023": 0.038818, "value_mse_loss_layer_024": 0.044189, "value_mse_loss_layer_025": 0.053467, "value_mse_loss_layer_026": 0.046875, "value_mse_loss_layer_027": 0.057129, "value_mse_loss_layer_028": 0.062988, "value_mse_loss_layer_029": 0.085938, "value_mse_loss_layer_030": 0.081055, "value_mse_loss_layer_031": 0.092773, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 9.2e-05, "vq_loss_layer_006": 0.000165, "vq_loss_layer_007": 0.000235, "vq_loss_layer_008": 0.000237, "vq_loss_layer_009": 0.000338, "vq_loss_layer_010": 0.000263, "vq_loss_layer_011": 0.000288, "vq_loss_layer_012": 0.00095, "vq_loss_layer_013": 0.000387, "vq_loss_layer_014": 0.000496, "vq_loss_layer_015": 0.000462, "vq_loss_layer_016": 0.000519, "vq_loss_layer_017": 0.000446, "vq_loss_layer_018": 0.000261, "vq_loss_layer_019": 0.000219, "vq_loss_layer_020": 0.000242, "vq_loss_layer_021": 0.000652, "vq_loss_layer_022": 0.000349, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000303, "vq_loss_layer_025": 0.000359, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.001221, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.005127 }, { "ce_loss": 2.320468, "epoch": 0.00386, "grad_norm": 0.006020748522132635, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.060791, "kv_mse_loss": 0.056241, "kv_vq_loss": 0.000624, "learning_rate": 0.0008966468261679386, "loss": 0.056842, "step": 3860, "value_mse_loss_layer_000": 0.000767, "value_mse_loss_layer_001": 0.002182, "value_mse_loss_layer_002": 0.008789, "value_mse_loss_layer_003": 0.015076, "value_mse_loss_layer_004": 0.012939, "value_mse_loss_layer_005": 0.012451, "value_mse_loss_layer_006": 0.014587, "value_mse_loss_layer_007": 0.016235, "value_mse_loss_layer_008": 0.019531, "value_mse_loss_layer_009": 0.024902, "value_mse_loss_layer_010": 0.021851, "value_mse_loss_layer_011": 0.023438, "value_mse_loss_layer_012": 0.025024, "value_mse_loss_layer_013": 0.025879, "value_mse_loss_layer_014": 0.025635, "value_mse_loss_layer_015": 0.028442, "value_mse_loss_layer_016": 0.023315, "value_mse_loss_layer_017": 0.029053, "value_mse_loss_layer_018": 0.026611, "value_mse_loss_layer_019": 0.030151, "value_mse_loss_layer_020": 0.030884, "value_mse_loss_layer_021": 0.037842, "value_mse_loss_layer_022": 0.03418, "value_mse_loss_layer_023": 0.042236, "value_mse_loss_layer_024": 0.041748, "value_mse_loss_layer_025": 0.0625, "value_mse_loss_layer_026": 0.044922, "value_mse_loss_layer_027": 0.062256, "value_mse_loss_layer_028": 0.060791, "value_mse_loss_layer_029": 0.12207, "value_mse_loss_layer_030": 0.078613, "value_mse_loss_layer_031": 0.092773, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000163, "vq_loss_layer_007": 0.000231, "vq_loss_layer_008": 0.000275, "vq_loss_layer_009": 0.000301, "vq_loss_layer_010": 0.000307, "vq_loss_layer_011": 0.000408, "vq_loss_layer_012": 0.000626, "vq_loss_layer_013": 0.000446, "vq_loss_layer_014": 0.000523, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.000496, "vq_loss_layer_017": 0.000553, "vq_loss_layer_018": 0.000301, "vq_loss_layer_019": 0.00025, "vq_loss_layer_020": 0.000269, "vq_loss_layer_021": 0.000622, "vq_loss_layer_022": 0.000288, "vq_loss_layer_023": 0.000557, "vq_loss_layer_024": 0.000355, "vq_loss_layer_025": 0.00058, "vq_loss_layer_026": 0.000679, "vq_loss_layer_027": 0.000885, "vq_loss_layer_028": 0.001083, "vq_loss_layer_029": 0.00209, "vq_loss_layer_030": 0.002808, "vq_loss_layer_031": 0.006622 }, { "ce_loss": 2.288191, "epoch": 0.00387, "grad_norm": 0.0036448275204747915, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.056241, "kv_vq_loss": 0.000648, "learning_rate": 0.0008969277412547277, "loss": 0.056882, "step": 3870, "value_mse_loss_layer_000": 0.000759, "value_mse_loss_layer_001": 0.002197, "value_mse_loss_layer_002": 0.008484, "value_mse_loss_layer_003": 0.013672, "value_mse_loss_layer_004": 0.012146, "value_mse_loss_layer_005": 0.01239, "value_mse_loss_layer_006": 0.015137, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.020142, "value_mse_loss_layer_009": 0.026733, "value_mse_loss_layer_010": 0.024658, "value_mse_loss_layer_011": 0.022461, "value_mse_loss_layer_012": 0.023438, "value_mse_loss_layer_013": 0.025024, "value_mse_loss_layer_014": 0.02832, "value_mse_loss_layer_015": 0.028931, "value_mse_loss_layer_016": 0.02478, "value_mse_loss_layer_017": 0.028076, "value_mse_loss_layer_018": 0.025513, "value_mse_loss_layer_019": 0.031006, "value_mse_loss_layer_020": 0.032715, "value_mse_loss_layer_021": 0.038086, "value_mse_loss_layer_022": 0.035645, "value_mse_loss_layer_023": 0.042725, "value_mse_loss_layer_024": 0.044189, "value_mse_loss_layer_025": 0.056152, "value_mse_loss_layer_026": 0.046143, "value_mse_loss_layer_027": 0.0625, "value_mse_loss_layer_028": 0.067871, "value_mse_loss_layer_029": 0.093262, "value_mse_loss_layer_030": 0.084473, "value_mse_loss_layer_031": 0.09082, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 9.2e-05, "vq_loss_layer_006": 0.000188, "vq_loss_layer_007": 0.000234, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000429, "vq_loss_layer_010": 0.000315, "vq_loss_layer_011": 0.000294, "vq_loss_layer_012": 0.000488, "vq_loss_layer_013": 0.000381, "vq_loss_layer_014": 0.000542, "vq_loss_layer_015": 0.000515, "vq_loss_layer_016": 0.0005, "vq_loss_layer_017": 0.000404, "vq_loss_layer_018": 0.000244, "vq_loss_layer_019": 0.000215, "vq_loss_layer_020": 0.000244, "vq_loss_layer_021": 0.000486, "vq_loss_layer_022": 0.000288, "vq_loss_layer_023": 0.000345, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.001465, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.005524 }, { "ce_loss": 2.298592, "epoch": 0.00388, "grad_norm": 0.0035268457140773535, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.056058, "kv_vq_loss": 0.000636, "learning_rate": 0.0008972079313985516, "loss": 0.056677, "step": 3880, "value_mse_loss_layer_000": 0.000759, "value_mse_loss_layer_001": 0.002197, "value_mse_loss_layer_002": 0.008789, "value_mse_loss_layer_003": 0.013855, "value_mse_loss_layer_004": 0.012878, "value_mse_loss_layer_005": 0.012512, "value_mse_loss_layer_006": 0.014648, "value_mse_loss_layer_007": 0.016235, "value_mse_loss_layer_008": 0.019287, "value_mse_loss_layer_009": 0.025635, "value_mse_loss_layer_010": 0.021606, "value_mse_loss_layer_011": 0.023926, "value_mse_loss_layer_012": 0.022461, "value_mse_loss_layer_013": 0.024414, "value_mse_loss_layer_014": 0.026123, "value_mse_loss_layer_015": 0.027588, "value_mse_loss_layer_016": 0.024902, "value_mse_loss_layer_017": 0.028687, "value_mse_loss_layer_018": 0.024902, "value_mse_loss_layer_019": 0.029907, "value_mse_loss_layer_020": 0.0354, "value_mse_loss_layer_021": 0.036865, "value_mse_loss_layer_022": 0.037354, "value_mse_loss_layer_023": 0.039307, "value_mse_loss_layer_024": 0.042236, "value_mse_loss_layer_025": 0.055664, "value_mse_loss_layer_026": 0.048096, "value_mse_loss_layer_027": 0.057617, "value_mse_loss_layer_028": 0.05957, "value_mse_loss_layer_029": 0.087891, "value_mse_loss_layer_030": 0.081055, "value_mse_loss_layer_031": 0.094238, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 8.5e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000158, "vq_loss_layer_007": 0.000236, "vq_loss_layer_008": 0.000254, "vq_loss_layer_009": 0.000374, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.00038, "vq_loss_layer_012": 0.000456, "vq_loss_layer_013": 0.000383, "vq_loss_layer_014": 0.000523, "vq_loss_layer_015": 0.000479, "vq_loss_layer_016": 0.000553, "vq_loss_layer_017": 0.000511, "vq_loss_layer_018": 0.000243, "vq_loss_layer_019": 0.000236, "vq_loss_layer_020": 0.000319, "vq_loss_layer_021": 0.000496, "vq_loss_layer_022": 0.000366, "vq_loss_layer_023": 0.000351, "vq_loss_layer_024": 0.000317, "vq_loss_layer_025": 0.000456, "vq_loss_layer_026": 0.000668, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000893, "vq_loss_layer_029": 0.001373, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.005859 }, { "ce_loss": 2.249956, "epoch": 0.00389, "grad_norm": 0.0035191322676837444, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.056085, "kv_vq_loss": 0.00065, "learning_rate": 0.0008974874003314268, "loss": 0.05672, "step": 3890, "value_mse_loss_layer_000": 0.000721, "value_mse_loss_layer_001": 0.002182, "value_mse_loss_layer_002": 0.00885, "value_mse_loss_layer_003": 0.015198, "value_mse_loss_layer_004": 0.012329, "value_mse_loss_layer_005": 0.012573, "value_mse_loss_layer_006": 0.014282, "value_mse_loss_layer_007": 0.017334, "value_mse_loss_layer_008": 0.019897, "value_mse_loss_layer_009": 0.025269, "value_mse_loss_layer_010": 0.021606, "value_mse_loss_layer_011": 0.023193, "value_mse_loss_layer_012": 0.026123, "value_mse_loss_layer_013": 0.026367, "value_mse_loss_layer_014": 0.026123, "value_mse_loss_layer_015": 0.029419, "value_mse_loss_layer_016": 0.024658, "value_mse_loss_layer_017": 0.028564, "value_mse_loss_layer_018": 0.028076, "value_mse_loss_layer_019": 0.032715, "value_mse_loss_layer_020": 0.035156, "value_mse_loss_layer_021": 0.037354, "value_mse_loss_layer_022": 0.03833, "value_mse_loss_layer_023": 0.047852, "value_mse_loss_layer_024": 0.046387, "value_mse_loss_layer_025": 0.056152, "value_mse_loss_layer_026": 0.049805, "value_mse_loss_layer_027": 0.0625, "value_mse_loss_layer_028": 0.070801, "value_mse_loss_layer_029": 0.09668, "value_mse_loss_layer_030": 0.088867, "value_mse_loss_layer_031": 0.097168, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000148, "vq_loss_layer_007": 0.00029, "vq_loss_layer_008": 0.000244, "vq_loss_layer_009": 0.000296, "vq_loss_layer_010": 0.000257, "vq_loss_layer_011": 0.000284, "vq_loss_layer_012": 0.000549, "vq_loss_layer_013": 0.000427, "vq_loss_layer_014": 0.000446, "vq_loss_layer_015": 0.000496, "vq_loss_layer_016": 0.000511, "vq_loss_layer_017": 0.000469, "vq_loss_layer_018": 0.000435, "vq_loss_layer_019": 0.000332, "vq_loss_layer_020": 0.000269, "vq_loss_layer_021": 0.000423, "vq_loss_layer_022": 0.00033, "vq_loss_layer_023": 0.000561, "vq_loss_layer_024": 0.000429, "vq_loss_layer_025": 0.000338, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.000977, "vq_loss_layer_029": 0.002029, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.005554 }, { "ce_loss": 2.259798, "epoch": 0.0039, "grad_norm": 0.0053539518266916275, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.056757, "kv_vq_loss": 0.000655, "learning_rate": 0.0008977661517566246, "loss": 0.057413, "step": 3900, "value_mse_loss_layer_000": 0.000797, "value_mse_loss_layer_001": 0.002213, "value_mse_loss_layer_002": 0.008362, "value_mse_loss_layer_003": 0.013489, "value_mse_loss_layer_004": 0.013489, "value_mse_loss_layer_005": 0.01239, "value_mse_loss_layer_006": 0.015381, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.019287, "value_mse_loss_layer_009": 0.027466, "value_mse_loss_layer_010": 0.021484, "value_mse_loss_layer_011": 0.022339, "value_mse_loss_layer_012": 0.023315, "value_mse_loss_layer_013": 0.024658, "value_mse_loss_layer_014": 0.025635, "value_mse_loss_layer_015": 0.028442, "value_mse_loss_layer_016": 0.025024, "value_mse_loss_layer_017": 0.027954, "value_mse_loss_layer_018": 0.026245, "value_mse_loss_layer_019": 0.028931, "value_mse_loss_layer_020": 0.032227, "value_mse_loss_layer_021": 0.04126, "value_mse_loss_layer_022": 0.035645, "value_mse_loss_layer_023": 0.043457, "value_mse_loss_layer_024": 0.041748, "value_mse_loss_layer_025": 0.060059, "value_mse_loss_layer_026": 0.044189, "value_mse_loss_layer_027": 0.066406, "value_mse_loss_layer_028": 0.062988, "value_mse_loss_layer_029": 0.111816, "value_mse_loss_layer_030": 0.079102, "value_mse_loss_layer_031": 0.089355, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 0.000138, "vq_loss_layer_005": 9.8e-05, "vq_loss_layer_006": 0.000181, "vq_loss_layer_007": 0.000248, "vq_loss_layer_008": 0.000222, "vq_loss_layer_009": 0.000473, "vq_loss_layer_010": 0.000252, "vq_loss_layer_011": 0.000271, "vq_loss_layer_012": 0.000507, "vq_loss_layer_013": 0.000391, "vq_loss_layer_014": 0.00046, "vq_loss_layer_015": 0.000479, "vq_loss_layer_016": 0.000471, "vq_loss_layer_017": 0.000418, "vq_loss_layer_018": 0.000263, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.000542, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000385, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000402, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000877, "vq_loss_layer_028": 0.000904, "vq_loss_layer_029": 0.001282, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.004974 }, { "ce_loss": 2.255571, "epoch": 0.00391, "grad_norm": 0.0036293864250183105, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.056067, "kv_vq_loss": 0.000659, "learning_rate": 0.0008980441893489666, "loss": 0.056738, "step": 3910, "value_mse_loss_layer_000": 0.000748, "value_mse_loss_layer_001": 0.002213, "value_mse_loss_layer_002": 0.008423, "value_mse_loss_layer_003": 0.013489, "value_mse_loss_layer_004": 0.012634, "value_mse_loss_layer_005": 0.012207, "value_mse_loss_layer_006": 0.014648, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.019043, "value_mse_loss_layer_009": 0.025024, "value_mse_loss_layer_010": 0.022217, "value_mse_loss_layer_011": 0.022949, "value_mse_loss_layer_012": 0.023804, "value_mse_loss_layer_013": 0.025635, "value_mse_loss_layer_014": 0.027832, "value_mse_loss_layer_015": 0.028564, "value_mse_loss_layer_016": 0.023682, "value_mse_loss_layer_017": 0.027466, "value_mse_loss_layer_018": 0.024414, "value_mse_loss_layer_019": 0.033447, "value_mse_loss_layer_020": 0.030396, "value_mse_loss_layer_021": 0.035889, "value_mse_loss_layer_022": 0.037842, "value_mse_loss_layer_023": 0.038574, "value_mse_loss_layer_024": 0.043457, "value_mse_loss_layer_025": 0.052002, "value_mse_loss_layer_026": 0.052246, "value_mse_loss_layer_027": 0.059326, "value_mse_loss_layer_028": 0.059326, "value_mse_loss_layer_029": 0.087402, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.094238, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 9e-05, "vq_loss_layer_005": 0.000101, "vq_loss_layer_006": 0.000165, "vq_loss_layer_007": 0.000242, "vq_loss_layer_008": 0.000261, "vq_loss_layer_009": 0.000319, "vq_loss_layer_010": 0.000317, "vq_loss_layer_011": 0.000315, "vq_loss_layer_012": 0.000519, "vq_loss_layer_013": 0.000479, "vq_loss_layer_014": 0.000614, "vq_loss_layer_015": 0.000572, "vq_loss_layer_016": 0.000488, "vq_loss_layer_017": 0.000444, "vq_loss_layer_018": 0.000257, "vq_loss_layer_019": 0.000237, "vq_loss_layer_020": 0.000278, "vq_loss_layer_021": 0.000526, "vq_loss_layer_022": 0.000422, "vq_loss_layer_023": 0.000427, "vq_loss_layer_024": 0.000376, "vq_loss_layer_025": 0.000423, "vq_loss_layer_026": 0.000862, "vq_loss_layer_027": 0.000732, "vq_loss_layer_028": 0.000881, "vq_loss_layer_029": 0.001373, "vq_loss_layer_030": 0.002502, "vq_loss_layer_031": 0.005859 }, { "ce_loss": 2.340337, "epoch": 0.00392, "grad_norm": 0.003215084783732891, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.042236, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.136719, "key_mse_loss_layer_014": 0.131836, "key_mse_loss_layer_015": 0.117676, "key_mse_loss_layer_016": 0.113281, "key_mse_loss_layer_017": 0.112793, "key_mse_loss_layer_018": 0.120605, "key_mse_loss_layer_019": 0.095215, "key_mse_loss_layer_020": 0.109863, "key_mse_loss_layer_021": 0.103027, "key_mse_loss_layer_022": 0.10791, "key_mse_loss_layer_023": 0.103516, "key_mse_loss_layer_024": 0.080566, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.089844, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.094238, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.055963, "kv_vq_loss": 0.000633, "learning_rate": 0.0008983215167551142, "loss": 0.056583, "step": 3920, "value_mse_loss_layer_000": 0.000767, "value_mse_loss_layer_001": 0.002197, "value_mse_loss_layer_002": 0.008911, "value_mse_loss_layer_003": 0.013428, "value_mse_loss_layer_004": 0.012878, "value_mse_loss_layer_005": 0.01239, "value_mse_loss_layer_006": 0.014526, "value_mse_loss_layer_007": 0.016113, "value_mse_loss_layer_008": 0.019409, "value_mse_loss_layer_009": 0.024414, "value_mse_loss_layer_010": 0.021362, "value_mse_loss_layer_011": 0.022339, "value_mse_loss_layer_012": 0.022461, "value_mse_loss_layer_013": 0.024658, "value_mse_loss_layer_014": 0.026733, "value_mse_loss_layer_015": 0.027344, "value_mse_loss_layer_016": 0.02417, "value_mse_loss_layer_017": 0.027954, "value_mse_loss_layer_018": 0.023926, "value_mse_loss_layer_019": 0.029785, "value_mse_loss_layer_020": 0.034912, "value_mse_loss_layer_021": 0.034668, "value_mse_loss_layer_022": 0.034668, "value_mse_loss_layer_023": 0.03833, "value_mse_loss_layer_024": 0.03833, "value_mse_loss_layer_025": 0.051514, "value_mse_loss_layer_026": 0.041016, "value_mse_loss_layer_027": 0.053711, "value_mse_loss_layer_028": 0.055908, "value_mse_loss_layer_029": 0.078125, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.098633, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2.2e-05, "vq_loss_layer_002": 2.7e-05, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 9.4e-05, "vq_loss_layer_005": 0.000106, "vq_loss_layer_006": 0.000174, "vq_loss_layer_007": 0.000238, "vq_loss_layer_008": 0.00034, "vq_loss_layer_009": 0.00032, "vq_loss_layer_010": 0.000324, "vq_loss_layer_011": 0.000362, "vq_loss_layer_012": 0.000519, "vq_loss_layer_013": 0.000446, "vq_loss_layer_014": 0.000679, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.000587, "vq_loss_layer_017": 0.000492, "vq_loss_layer_018": 0.000263, "vq_loss_layer_019": 0.000241, "vq_loss_layer_020": 0.000359, "vq_loss_layer_021": 0.000618, "vq_loss_layer_022": 0.000458, "vq_loss_layer_023": 0.000618, "vq_loss_layer_024": 0.000423, "vq_loss_layer_025": 0.000717, "vq_loss_layer_026": 0.000801, "vq_loss_layer_027": 0.000778, "vq_loss_layer_028": 0.001396, "vq_loss_layer_029": 0.001465, "vq_loss_layer_030": 0.003098, "vq_loss_layer_031": 0.00824 }, { "ce_loss": 2.28676, "epoch": 0.00393, "grad_norm": 0.0041482034139335155, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.056061, "kv_vq_loss": 0.000639, "learning_rate": 0.0008985981375938566, "loss": 0.056699, "step": 3930, "value_mse_loss_layer_000": 0.000717, "value_mse_loss_layer_001": 0.002136, "value_mse_loss_layer_002": 0.008545, "value_mse_loss_layer_003": 0.013428, "value_mse_loss_layer_004": 0.013062, "value_mse_loss_layer_005": 0.012817, "value_mse_loss_layer_006": 0.014771, "value_mse_loss_layer_007": 0.016724, "value_mse_loss_layer_008": 0.019897, "value_mse_loss_layer_009": 0.026245, "value_mse_loss_layer_010": 0.021973, "value_mse_loss_layer_011": 0.023682, "value_mse_loss_layer_012": 0.026367, "value_mse_loss_layer_013": 0.025513, "value_mse_loss_layer_014": 0.027344, "value_mse_loss_layer_015": 0.028809, "value_mse_loss_layer_016": 0.024658, "value_mse_loss_layer_017": 0.028076, "value_mse_loss_layer_018": 0.025757, "value_mse_loss_layer_019": 0.030151, "value_mse_loss_layer_020": 0.032471, "value_mse_loss_layer_021": 0.038086, "value_mse_loss_layer_022": 0.036133, "value_mse_loss_layer_023": 0.040039, "value_mse_loss_layer_024": 0.044678, "value_mse_loss_layer_025": 0.053711, "value_mse_loss_layer_026": 0.047607, "value_mse_loss_layer_027": 0.05835, "value_mse_loss_layer_028": 0.071777, "value_mse_loss_layer_029": 0.089844, "value_mse_loss_layer_030": 0.086914, "value_mse_loss_layer_031": 0.098633, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 9.2e-05, "vq_loss_layer_005": 0.000105, "vq_loss_layer_006": 0.000159, "vq_loss_layer_007": 0.000246, "vq_loss_layer_008": 0.000278, "vq_loss_layer_009": 0.000355, "vq_loss_layer_010": 0.00033, "vq_loss_layer_011": 0.000347, "vq_loss_layer_012": 0.000648, "vq_loss_layer_013": 0.0005, "vq_loss_layer_014": 0.000568, "vq_loss_layer_015": 0.000576, "vq_loss_layer_016": 0.000587, "vq_loss_layer_017": 0.000519, "vq_loss_layer_018": 0.000315, "vq_loss_layer_019": 0.00025, "vq_loss_layer_020": 0.000319, "vq_loss_layer_021": 0.000584, "vq_loss_layer_022": 0.000412, "vq_loss_layer_023": 0.000456, "vq_loss_layer_024": 0.000469, "vq_loss_layer_025": 0.000515, "vq_loss_layer_026": 0.000748, "vq_loss_layer_027": 0.00074, "vq_loss_layer_028": 0.00161, "vq_loss_layer_029": 0.001564, "vq_loss_layer_030": 0.003738, "vq_loss_layer_031": 0.006714 }, { "ce_loss": 2.315205, "epoch": 0.00394, "grad_norm": 0.0033112962264567614, "key_mse_loss_layer_000": 0.005035, "key_mse_loss_layer_001": 0.011841, "key_mse_loss_layer_002": 0.065918, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.092773, "key_mse_loss_layer_009": 0.09668, "key_mse_loss_layer_010": 0.110352, "key_mse_loss_layer_011": 0.108398, "key_mse_loss_layer_012": 0.080078, "key_mse_loss_layer_013": 0.133789, "key_mse_loss_layer_014": 0.129883, "key_mse_loss_layer_015": 0.115723, "key_mse_loss_layer_016": 0.110352, "key_mse_loss_layer_017": 0.106445, "key_mse_loss_layer_018": 0.119141, "key_mse_loss_layer_019": 0.095215, "key_mse_loss_layer_020": 0.107422, "key_mse_loss_layer_021": 0.101074, "key_mse_loss_layer_022": 0.106445, "key_mse_loss_layer_023": 0.104004, "key_mse_loss_layer_024": 0.085938, "key_mse_loss_layer_025": 0.079102, "key_mse_loss_layer_026": 0.097656, "key_mse_loss_layer_027": 0.098145, "key_mse_loss_layer_028": 0.103027, "key_mse_loss_layer_029": 0.092773, "key_mse_loss_layer_030": 0.106445, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.056171, "kv_vq_loss": 0.000647, "learning_rate": 0.0008988740554563934, "loss": 0.056818, "step": 3940, "value_mse_loss_layer_000": 0.000793, "value_mse_loss_layer_001": 0.002243, "value_mse_loss_layer_002": 0.008911, "value_mse_loss_layer_003": 0.014648, "value_mse_loss_layer_004": 0.013062, "value_mse_loss_layer_005": 0.012756, "value_mse_loss_layer_006": 0.014526, "value_mse_loss_layer_007": 0.016479, "value_mse_loss_layer_008": 0.019043, "value_mse_loss_layer_009": 0.023926, "value_mse_loss_layer_010": 0.02124, "value_mse_loss_layer_011": 0.022095, "value_mse_loss_layer_012": 0.024292, "value_mse_loss_layer_013": 0.024536, "value_mse_loss_layer_014": 0.027344, "value_mse_loss_layer_015": 0.024902, "value_mse_loss_layer_016": 0.026733, "value_mse_loss_layer_017": 0.024536, "value_mse_loss_layer_018": 0.026489, "value_mse_loss_layer_019": 0.028442, "value_mse_loss_layer_020": 0.029541, "value_mse_loss_layer_021": 0.032959, "value_mse_loss_layer_022": 0.033691, "value_mse_loss_layer_023": 0.036865, "value_mse_loss_layer_024": 0.043457, "value_mse_loss_layer_025": 0.051514, "value_mse_loss_layer_026": 0.043213, "value_mse_loss_layer_027": 0.063477, "value_mse_loss_layer_028": 0.058105, "value_mse_loss_layer_029": 0.081055, "value_mse_loss_layer_030": 0.085938, "value_mse_loss_layer_031": 0.100586, "vq_loss_layer_000": 1.1e-05, "vq_loss_layer_001": 2.7e-05, "vq_loss_layer_002": 3.5e-05, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 9.6e-05, "vq_loss_layer_005": 0.000132, "vq_loss_layer_006": 0.000175, "vq_loss_layer_007": 0.000273, "vq_loss_layer_008": 0.000326, "vq_loss_layer_009": 0.000355, "vq_loss_layer_010": 0.000366, "vq_loss_layer_011": 0.000328, "vq_loss_layer_012": 0.00069, "vq_loss_layer_013": 0.000664, "vq_loss_layer_014": 0.000645, "vq_loss_layer_015": 0.000519, "vq_loss_layer_016": 0.000786, "vq_loss_layer_017": 0.000393, "vq_loss_layer_018": 0.00033, "vq_loss_layer_019": 0.000259, "vq_loss_layer_020": 0.000252, "vq_loss_layer_021": 0.000437, "vq_loss_layer_022": 0.00033, "vq_loss_layer_023": 0.00038, "vq_loss_layer_024": 0.000427, "vq_loss_layer_025": 0.000816, "vq_loss_layer_026": 0.000702, "vq_loss_layer_027": 0.001152, "vq_loss_layer_028": 0.00145, "vq_loss_layer_029": 0.00206, "vq_loss_layer_030": 0.00383, "vq_loss_layer_031": 0.00885 }, { "ce_loss": 2.298854, "epoch": 0.00395, "grad_norm": 0.005017372779548168, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.056024, "kv_vq_loss": 0.000638, "learning_rate": 0.0008991492739066149, "loss": 0.056665, "step": 3950, "value_mse_loss_layer_000": 0.000748, "value_mse_loss_layer_001": 0.002197, "value_mse_loss_layer_002": 0.008911, "value_mse_loss_layer_003": 0.013, "value_mse_loss_layer_004": 0.012024, "value_mse_loss_layer_005": 0.012512, "value_mse_loss_layer_006": 0.014954, "value_mse_loss_layer_007": 0.015869, "value_mse_loss_layer_008": 0.019775, "value_mse_loss_layer_009": 0.024902, "value_mse_loss_layer_010": 0.022217, "value_mse_loss_layer_011": 0.022705, "value_mse_loss_layer_012": 0.023804, "value_mse_loss_layer_013": 0.025146, "value_mse_loss_layer_014": 0.026855, "value_mse_loss_layer_015": 0.028198, "value_mse_loss_layer_016": 0.023804, "value_mse_loss_layer_017": 0.029663, "value_mse_loss_layer_018": 0.026001, "value_mse_loss_layer_019": 0.030029, "value_mse_loss_layer_020": 0.031738, "value_mse_loss_layer_021": 0.04248, "value_mse_loss_layer_022": 0.037354, "value_mse_loss_layer_023": 0.041504, "value_mse_loss_layer_024": 0.04248, "value_mse_loss_layer_025": 0.053955, "value_mse_loss_layer_026": 0.045166, "value_mse_loss_layer_027": 0.057861, "value_mse_loss_layer_028": 0.061279, "value_mse_loss_layer_029": 0.088379, "value_mse_loss_layer_030": 0.08252, "value_mse_loss_layer_031": 0.107422, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000171, "vq_loss_layer_007": 0.000227, "vq_loss_layer_008": 0.000273, "vq_loss_layer_009": 0.000288, "vq_loss_layer_010": 0.000292, "vq_loss_layer_011": 0.000298, "vq_loss_layer_012": 0.000534, "vq_loss_layer_013": 0.00053, "vq_loss_layer_014": 0.000538, "vq_loss_layer_015": 0.000504, "vq_loss_layer_016": 0.000473, "vq_loss_layer_017": 0.000546, "vq_loss_layer_018": 0.00029, "vq_loss_layer_019": 0.000235, "vq_loss_layer_020": 0.000263, "vq_loss_layer_021": 0.000576, "vq_loss_layer_022": 0.000334, "vq_loss_layer_023": 0.000376, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.00038, "vq_loss_layer_026": 0.000538, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000874, "vq_loss_layer_029": 0.001244, "vq_loss_layer_030": 0.002716, "vq_loss_layer_031": 0.006653 }, { "ce_loss": 2.244726, "epoch": 0.00396, "grad_norm": 0.003947092220187187, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.056268, "kv_vq_loss": 0.000666, "learning_rate": 0.0008994237964813779, "loss": 0.056915, "step": 3960, "value_mse_loss_layer_000": 0.000736, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.008362, "value_mse_loss_layer_003": 0.01355, "value_mse_loss_layer_004": 0.013855, "value_mse_loss_layer_005": 0.011658, "value_mse_loss_layer_006": 0.014282, "value_mse_loss_layer_007": 0.017456, "value_mse_loss_layer_008": 0.018921, "value_mse_loss_layer_009": 0.025146, "value_mse_loss_layer_010": 0.021484, "value_mse_loss_layer_011": 0.022583, "value_mse_loss_layer_012": 0.022827, "value_mse_loss_layer_013": 0.024414, "value_mse_loss_layer_014": 0.026367, "value_mse_loss_layer_015": 0.028687, "value_mse_loss_layer_016": 0.024536, "value_mse_loss_layer_017": 0.028687, "value_mse_loss_layer_018": 0.025635, "value_mse_loss_layer_019": 0.030518, "value_mse_loss_layer_020": 0.032959, "value_mse_loss_layer_021": 0.038818, "value_mse_loss_layer_022": 0.036621, "value_mse_loss_layer_023": 0.05249, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.05249, "value_mse_loss_layer_026": 0.049316, "value_mse_loss_layer_027": 0.05835, "value_mse_loss_layer_028": 0.062012, "value_mse_loss_layer_029": 0.083496, "value_mse_loss_layer_030": 0.078613, "value_mse_loss_layer_031": 0.088379, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 0.000132, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000151, "vq_loss_layer_007": 0.000313, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000292, "vq_loss_layer_010": 0.000256, "vq_loss_layer_011": 0.000296, "vq_loss_layer_012": 0.00046, "vq_loss_layer_013": 0.000362, "vq_loss_layer_014": 0.000507, "vq_loss_layer_015": 0.000475, "vq_loss_layer_016": 0.000465, "vq_loss_layer_017": 0.000483, "vq_loss_layer_018": 0.000252, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000282, "vq_loss_layer_021": 0.00045, "vq_loss_layer_022": 0.00028, "vq_loss_layer_023": 0.000549, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.00034, "vq_loss_layer_026": 0.000656, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.001884, "vq_loss_layer_031": 0.004944 }, { "ce_loss": 2.316813, "epoch": 0.00397, "grad_norm": 0.004129042383283377, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.055994, "kv_vq_loss": 0.000631, "learning_rate": 0.0008996976266907786, "loss": 0.056607, "step": 3970, "value_mse_loss_layer_000": 0.000755, "value_mse_loss_layer_001": 0.002213, "value_mse_loss_layer_002": 0.008484, "value_mse_loss_layer_003": 0.013306, "value_mse_loss_layer_004": 0.013123, "value_mse_loss_layer_005": 0.012451, "value_mse_loss_layer_006": 0.014526, "value_mse_loss_layer_007": 0.016357, "value_mse_loss_layer_008": 0.019531, "value_mse_loss_layer_009": 0.025146, "value_mse_loss_layer_010": 0.020996, "value_mse_loss_layer_011": 0.022705, "value_mse_loss_layer_012": 0.023804, "value_mse_loss_layer_013": 0.025146, "value_mse_loss_layer_014": 0.025879, "value_mse_loss_layer_015": 0.027832, "value_mse_loss_layer_016": 0.023926, "value_mse_loss_layer_017": 0.02771, "value_mse_loss_layer_018": 0.02832, "value_mse_loss_layer_019": 0.029541, "value_mse_loss_layer_020": 0.032471, "value_mse_loss_layer_021": 0.036133, "value_mse_loss_layer_022": 0.035889, "value_mse_loss_layer_023": 0.041504, "value_mse_loss_layer_024": 0.042725, "value_mse_loss_layer_025": 0.05249, "value_mse_loss_layer_026": 0.04541, "value_mse_loss_layer_027": 0.058594, "value_mse_loss_layer_028": 0.060059, "value_mse_loss_layer_029": 0.098145, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.094238, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 8.7e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000144, "vq_loss_layer_007": 0.000227, "vq_loss_layer_008": 0.00022, "vq_loss_layer_009": 0.000305, "vq_loss_layer_010": 0.000244, "vq_loss_layer_011": 0.000322, "vq_loss_layer_012": 0.000519, "vq_loss_layer_013": 0.000416, "vq_loss_layer_014": 0.000462, "vq_loss_layer_015": 0.000465, "vq_loss_layer_016": 0.000454, "vq_loss_layer_017": 0.000393, "vq_loss_layer_018": 0.000275, "vq_loss_layer_019": 0.000216, "vq_loss_layer_020": 0.000239, "vq_loss_layer_021": 0.000435, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.000351, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.001404, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.005585 }, { "ce_loss": 2.304903, "epoch": 0.00398, "grad_norm": 0.005311733577400446, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.060791, "kv_mse_loss": 0.056082, "kv_vq_loss": 0.000645, "learning_rate": 0.0008999707680184218, "loss": 0.056717, "step": 3980, "value_mse_loss_layer_000": 0.000717, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.008667, "value_mse_loss_layer_003": 0.014038, "value_mse_loss_layer_004": 0.012878, "value_mse_loss_layer_005": 0.012695, "value_mse_loss_layer_006": 0.015747, "value_mse_loss_layer_007": 0.016113, "value_mse_loss_layer_008": 0.019531, "value_mse_loss_layer_009": 0.025146, "value_mse_loss_layer_010": 0.020996, "value_mse_loss_layer_011": 0.022339, "value_mse_loss_layer_012": 0.023193, "value_mse_loss_layer_013": 0.024536, "value_mse_loss_layer_014": 0.031494, "value_mse_loss_layer_015": 0.028687, "value_mse_loss_layer_016": 0.02356, "value_mse_loss_layer_017": 0.030884, "value_mse_loss_layer_018": 0.02771, "value_mse_loss_layer_019": 0.031738, "value_mse_loss_layer_020": 0.030884, "value_mse_loss_layer_021": 0.036865, "value_mse_loss_layer_022": 0.034912, "value_mse_loss_layer_023": 0.040527, "value_mse_loss_layer_024": 0.062988, "value_mse_loss_layer_025": 0.056641, "value_mse_loss_layer_026": 0.048096, "value_mse_loss_layer_027": 0.062988, "value_mse_loss_layer_028": 0.067383, "value_mse_loss_layer_029": 0.10498, "value_mse_loss_layer_030": 0.105957, "value_mse_loss_layer_031": 0.098145, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000218, "vq_loss_layer_007": 0.000209, "vq_loss_layer_008": 0.000286, "vq_loss_layer_009": 0.00034, "vq_loss_layer_010": 0.000299, "vq_loss_layer_011": 0.000326, "vq_loss_layer_012": 0.000486, "vq_loss_layer_013": 0.000412, "vq_loss_layer_014": 0.000771, "vq_loss_layer_015": 0.000614, "vq_loss_layer_016": 0.000515, "vq_loss_layer_017": 0.000854, "vq_loss_layer_018": 0.000412, "vq_loss_layer_019": 0.00029, "vq_loss_layer_020": 0.000256, "vq_loss_layer_021": 0.000568, "vq_loss_layer_022": 0.000345, "vq_loss_layer_023": 0.000425, "vq_loss_layer_024": 0.000793, "vq_loss_layer_025": 0.000511, "vq_loss_layer_026": 0.000805, "vq_loss_layer_027": 0.0009, "vq_loss_layer_028": 0.001335, "vq_loss_layer_029": 0.002457, "vq_loss_layer_030": 0.003479, "vq_loss_layer_031": 0.007416 }, { "ce_loss": 2.265787, "epoch": 0.00399, "grad_norm": 0.004040027502924204, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.05654, "kv_vq_loss": 0.000647, "learning_rate": 0.000900243223921687, "loss": 0.057193, "step": 3990, "value_mse_loss_layer_000": 0.000721, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.008606, "value_mse_loss_layer_003": 0.012634, "value_mse_loss_layer_004": 0.011963, "value_mse_loss_layer_005": 0.011658, "value_mse_loss_layer_006": 0.014404, "value_mse_loss_layer_007": 0.016724, "value_mse_loss_layer_008": 0.019531, "value_mse_loss_layer_009": 0.025635, "value_mse_loss_layer_010": 0.022461, "value_mse_loss_layer_011": 0.022583, "value_mse_loss_layer_012": 0.024048, "value_mse_loss_layer_013": 0.024658, "value_mse_loss_layer_014": 0.026855, "value_mse_loss_layer_015": 0.029419, "value_mse_loss_layer_016": 0.024048, "value_mse_loss_layer_017": 0.028687, "value_mse_loss_layer_018": 0.025024, "value_mse_loss_layer_019": 0.030029, "value_mse_loss_layer_020": 0.031738, "value_mse_loss_layer_021": 0.040527, "value_mse_loss_layer_022": 0.041992, "value_mse_loss_layer_023": 0.042725, "value_mse_loss_layer_024": 0.04834, "value_mse_loss_layer_025": 0.05127, "value_mse_loss_layer_026": 0.049072, "value_mse_loss_layer_027": 0.063477, "value_mse_loss_layer_028": 0.063477, "value_mse_loss_layer_029": 0.087891, "value_mse_loss_layer_030": 0.085938, "value_mse_loss_layer_031": 0.093262, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 7.7e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000153, "vq_loss_layer_007": 0.000286, "vq_loss_layer_008": 0.000243, "vq_loss_layer_009": 0.00033, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000523, "vq_loss_layer_013": 0.000402, "vq_loss_layer_014": 0.000486, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.000469, "vq_loss_layer_017": 0.000515, "vq_loss_layer_018": 0.000234, "vq_loss_layer_019": 0.00021, "vq_loss_layer_020": 0.00025, "vq_loss_layer_021": 0.000572, "vq_loss_layer_022": 0.000406, "vq_loss_layer_023": 0.000366, "vq_loss_layer_024": 0.000414, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000641, "vq_loss_layer_027": 0.000828, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.00148, "vq_loss_layer_030": 0.002716, "vq_loss_layer_031": 0.005646 }, { "ce_loss": 2.330357, "epoch": 0.004, "grad_norm": 0.0034954589791595936, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.044922, "key_mse_loss_layer_004": 0.041504, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.104004, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.139648, "key_mse_loss_layer_014": 0.133789, "key_mse_loss_layer_015": 0.120605, "key_mse_loss_layer_016": 0.114258, "key_mse_loss_layer_017": 0.112793, "key_mse_loss_layer_018": 0.121094, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.106934, "key_mse_loss_layer_021": 0.101562, "key_mse_loss_layer_022": 0.106445, "key_mse_loss_layer_023": 0.102051, "key_mse_loss_layer_024": 0.080078, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.088867, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.056146, "kv_vq_loss": 0.000617, "learning_rate": 0.0009005149978319905, "loss": 0.056723, "step": 4000, "value_mse_loss_layer_000": 0.000755, "value_mse_loss_layer_001": 0.002136, "value_mse_loss_layer_002": 0.008728, "value_mse_loss_layer_003": 0.014465, "value_mse_loss_layer_004": 0.013611, "value_mse_loss_layer_005": 0.013062, "value_mse_loss_layer_006": 0.014587, "value_mse_loss_layer_007": 0.016479, "value_mse_loss_layer_008": 0.018555, "value_mse_loss_layer_009": 0.024658, "value_mse_loss_layer_010": 0.020996, "value_mse_loss_layer_011": 0.021851, "value_mse_loss_layer_012": 0.022583, "value_mse_loss_layer_013": 0.025391, "value_mse_loss_layer_014": 0.025146, "value_mse_loss_layer_015": 0.026001, "value_mse_loss_layer_016": 0.023804, "value_mse_loss_layer_017": 0.026123, "value_mse_loss_layer_018": 0.023804, "value_mse_loss_layer_019": 0.0271, "value_mse_loss_layer_020": 0.029663, "value_mse_loss_layer_021": 0.033691, "value_mse_loss_layer_022": 0.032715, "value_mse_loss_layer_023": 0.033203, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.057617, "value_mse_loss_layer_026": 0.03833, "value_mse_loss_layer_027": 0.050537, "value_mse_loss_layer_028": 0.052246, "value_mse_loss_layer_029": 0.071289, "value_mse_loss_layer_030": 0.071777, "value_mse_loss_layer_031": 0.091309, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2.1e-05, "vq_loss_layer_002": 2.6e-05, "vq_loss_layer_003": 5.2e-05, "vq_loss_layer_004": 0.000131, "vq_loss_layer_005": 0.000135, "vq_loss_layer_006": 0.000183, "vq_loss_layer_007": 0.000257, "vq_loss_layer_008": 0.000275, "vq_loss_layer_009": 0.000332, "vq_loss_layer_010": 0.000334, "vq_loss_layer_011": 0.000313, "vq_loss_layer_012": 0.0005, "vq_loss_layer_013": 0.000439, "vq_loss_layer_014": 0.000549, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.000603, "vq_loss_layer_017": 0.000368, "vq_loss_layer_018": 0.00025, "vq_loss_layer_019": 0.000211, "vq_loss_layer_020": 0.000286, "vq_loss_layer_021": 0.000584, "vq_loss_layer_022": 0.000385, "vq_loss_layer_023": 0.000404, "vq_loss_layer_024": 0.000465, "vq_loss_layer_025": 0.000797, "vq_loss_layer_026": 0.00071, "vq_loss_layer_027": 0.000851, "vq_loss_layer_028": 0.001083, "vq_loss_layer_029": 0.001396, "vq_loss_layer_030": 0.003311, "vq_loss_layer_031": 0.00766 }, { "ce_loss": 2.322964, "epoch": 0.00401, "grad_norm": 0.004227401223033667, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.044922, "key_mse_loss_layer_004": 0.04126, "key_mse_loss_layer_005": 0.054932, "key_mse_loss_layer_006": 0.061279, "key_mse_loss_layer_007": 0.071777, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.058838, "kv_mse_loss": 0.05614, "kv_vq_loss": 0.000625, "learning_rate": 0.0009007860931550454, "loss": 0.05676, "step": 4010, "value_mse_loss_layer_000": 0.00074, "value_mse_loss_layer_001": 0.002182, "value_mse_loss_layer_002": 0.009216, "value_mse_loss_layer_003": 0.014648, "value_mse_loss_layer_004": 0.012878, "value_mse_loss_layer_005": 0.012329, "value_mse_loss_layer_006": 0.014709, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.019043, "value_mse_loss_layer_009": 0.025635, "value_mse_loss_layer_010": 0.021362, "value_mse_loss_layer_011": 0.022827, "value_mse_loss_layer_012": 0.022705, "value_mse_loss_layer_013": 0.025024, "value_mse_loss_layer_014": 0.028076, "value_mse_loss_layer_015": 0.028076, "value_mse_loss_layer_016": 0.02356, "value_mse_loss_layer_017": 0.032227, "value_mse_loss_layer_018": 0.030518, "value_mse_loss_layer_019": 0.030273, "value_mse_loss_layer_020": 0.030029, "value_mse_loss_layer_021": 0.04126, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.047852, "value_mse_loss_layer_024": 0.039307, "value_mse_loss_layer_025": 0.05127, "value_mse_loss_layer_026": 0.041748, "value_mse_loss_layer_027": 0.05835, "value_mse_loss_layer_028": 0.068359, "value_mse_loss_layer_029": 0.081543, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.093262, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 4.8e-05, "vq_loss_layer_004": 8.6e-05, "vq_loss_layer_005": 8.8e-05, "vq_loss_layer_006": 0.000182, "vq_loss_layer_007": 0.00022, "vq_loss_layer_008": 0.000278, "vq_loss_layer_009": 0.000364, "vq_loss_layer_010": 0.000332, "vq_loss_layer_011": 0.000387, "vq_loss_layer_012": 0.000496, "vq_loss_layer_013": 0.000439, "vq_loss_layer_014": 0.000736, "vq_loss_layer_015": 0.000584, "vq_loss_layer_016": 0.000546, "vq_loss_layer_017": 0.000767, "vq_loss_layer_018": 0.000389, "vq_loss_layer_019": 0.000261, "vq_loss_layer_020": 0.000284, "vq_loss_layer_021": 0.000797, "vq_loss_layer_022": 0.000351, "vq_loss_layer_023": 0.000668, "vq_loss_layer_024": 0.000351, "vq_loss_layer_025": 0.000526, "vq_loss_layer_026": 0.000607, "vq_loss_layer_027": 0.000736, "vq_loss_layer_028": 0.001633, "vq_loss_layer_029": 0.00161, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.007172 }, { "ce_loss": 2.320974, "epoch": 0.00402, "grad_norm": 0.004623678512871265, "key_mse_loss_layer_000": 0.002808, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.094727, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.085938, "key_mse_loss_layer_023": 0.083008, "key_mse_loss_layer_024": 0.064453, "key_mse_loss_layer_025": 0.064453, "key_mse_loss_layer_026": 0.072754, "key_mse_loss_layer_027": 0.071289, "key_mse_loss_layer_028": 0.079102, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.056091, "kv_vq_loss": 0.000643, "learning_rate": 0.0009010565132711174, "loss": 0.056735, "step": 4020, "value_mse_loss_layer_000": 0.000748, "value_mse_loss_layer_001": 0.002167, "value_mse_loss_layer_002": 0.008362, "value_mse_loss_layer_003": 0.012817, "value_mse_loss_layer_004": 0.011658, "value_mse_loss_layer_005": 0.011597, "value_mse_loss_layer_006": 0.014893, "value_mse_loss_layer_007": 0.016479, "value_mse_loss_layer_008": 0.019043, "value_mse_loss_layer_009": 0.024902, "value_mse_loss_layer_010": 0.021362, "value_mse_loss_layer_011": 0.022339, "value_mse_loss_layer_012": 0.025391, "value_mse_loss_layer_013": 0.025024, "value_mse_loss_layer_014": 0.025513, "value_mse_loss_layer_015": 0.030151, "value_mse_loss_layer_016": 0.023682, "value_mse_loss_layer_017": 0.030396, "value_mse_loss_layer_018": 0.024658, "value_mse_loss_layer_019": 0.029175, "value_mse_loss_layer_020": 0.031494, "value_mse_loss_layer_021": 0.0354, "value_mse_loss_layer_022": 0.034424, "value_mse_loss_layer_023": 0.039551, "value_mse_loss_layer_024": 0.041748, "value_mse_loss_layer_025": 0.052979, "value_mse_loss_layer_026": 0.049316, "value_mse_loss_layer_027": 0.064453, "value_mse_loss_layer_028": 0.059082, "value_mse_loss_layer_029": 0.087891, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.085938, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000175, "vq_loss_layer_007": 0.000259, "vq_loss_layer_008": 0.000217, "vq_loss_layer_009": 0.000288, "vq_loss_layer_010": 0.000243, "vq_loss_layer_011": 0.000273, "vq_loss_layer_012": 0.000599, "vq_loss_layer_013": 0.000454, "vq_loss_layer_014": 0.000448, "vq_loss_layer_015": 0.000698, "vq_loss_layer_016": 0.000441, "vq_loss_layer_017": 0.000641, "vq_loss_layer_018": 0.000239, "vq_loss_layer_019": 0.0002, "vq_loss_layer_020": 0.000273, "vq_loss_layer_021": 0.000462, "vq_loss_layer_022": 0.000319, "vq_loss_layer_023": 0.000406, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.00042, "vq_loss_layer_026": 0.000759, "vq_loss_layer_027": 0.000904, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.00135, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.005219 }, { "ce_loss": 2.308252, "epoch": 0.00403, "grad_norm": 0.00404768530279398, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.055804, "kv_vq_loss": 0.000637, "learning_rate": 0.0009013262615352772, "loss": 0.056445, "step": 4030, "value_mse_loss_layer_000": 0.000725, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.008301, "value_mse_loss_layer_003": 0.013062, "value_mse_loss_layer_004": 0.012695, "value_mse_loss_layer_005": 0.012207, "value_mse_loss_layer_006": 0.014282, "value_mse_loss_layer_007": 0.015442, "value_mse_loss_layer_008": 0.02063, "value_mse_loss_layer_009": 0.024414, "value_mse_loss_layer_010": 0.020508, "value_mse_loss_layer_011": 0.022095, "value_mse_loss_layer_012": 0.022461, "value_mse_loss_layer_013": 0.024658, "value_mse_loss_layer_014": 0.025635, "value_mse_loss_layer_015": 0.028687, "value_mse_loss_layer_016": 0.024902, "value_mse_loss_layer_017": 0.027954, "value_mse_loss_layer_018": 0.026245, "value_mse_loss_layer_019": 0.029175, "value_mse_loss_layer_020": 0.031494, "value_mse_loss_layer_021": 0.03833, "value_mse_loss_layer_022": 0.037354, "value_mse_loss_layer_023": 0.040527, "value_mse_loss_layer_024": 0.046387, "value_mse_loss_layer_025": 0.053467, "value_mse_loss_layer_026": 0.049072, "value_mse_loss_layer_027": 0.062256, "value_mse_loss_layer_028": 0.062988, "value_mse_loss_layer_029": 0.092285, "value_mse_loss_layer_030": 0.080078, "value_mse_loss_layer_031": 0.098145, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000151, "vq_loss_layer_007": 0.000217, "vq_loss_layer_008": 0.000275, "vq_loss_layer_009": 0.00029, "vq_loss_layer_010": 0.000238, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.00046, "vq_loss_layer_013": 0.000408, "vq_loss_layer_014": 0.000496, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.000488, "vq_loss_layer_017": 0.000399, "vq_loss_layer_018": 0.000267, "vq_loss_layer_019": 0.000205, "vq_loss_layer_020": 0.000227, "vq_loss_layer_021": 0.00045, "vq_loss_layer_022": 0.00033, "vq_loss_layer_023": 0.000301, "vq_loss_layer_024": 0.000345, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000885, "vq_loss_layer_029": 0.001602, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.006134 }, { "ce_loss": 2.290458, "epoch": 0.00404, "grad_norm": 0.004408067092299461, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.055835, "kv_vq_loss": 0.000622, "learning_rate": 0.0009015953412776511, "loss": 0.056433, "step": 4040, "value_mse_loss_layer_000": 0.000736, "value_mse_loss_layer_001": 0.002197, "value_mse_loss_layer_002": 0.008301, "value_mse_loss_layer_003": 0.013245, "value_mse_loss_layer_004": 0.012329, "value_mse_loss_layer_005": 0.012207, "value_mse_loss_layer_006": 0.015381, "value_mse_loss_layer_007": 0.016235, "value_mse_loss_layer_008": 0.018677, "value_mse_loss_layer_009": 0.025024, "value_mse_loss_layer_010": 0.02124, "value_mse_loss_layer_011": 0.021851, "value_mse_loss_layer_012": 0.022461, "value_mse_loss_layer_013": 0.023682, "value_mse_loss_layer_014": 0.026855, "value_mse_loss_layer_015": 0.028076, "value_mse_loss_layer_016": 0.023804, "value_mse_loss_layer_017": 0.026978, "value_mse_loss_layer_018": 0.025269, "value_mse_loss_layer_019": 0.030151, "value_mse_loss_layer_020": 0.03125, "value_mse_loss_layer_021": 0.039551, "value_mse_loss_layer_022": 0.035889, "value_mse_loss_layer_023": 0.04248, "value_mse_loss_layer_024": 0.041992, "value_mse_loss_layer_025": 0.058105, "value_mse_loss_layer_026": 0.046875, "value_mse_loss_layer_027": 0.061768, "value_mse_loss_layer_028": 0.077637, "value_mse_loss_layer_029": 0.089844, "value_mse_loss_layer_030": 0.084961, "value_mse_loss_layer_031": 0.09082, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.0002, "vq_loss_layer_007": 0.000252, "vq_loss_layer_008": 0.000226, "vq_loss_layer_009": 0.00034, "vq_loss_layer_010": 0.000252, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.000483, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000492, "vq_loss_layer_016": 0.000481, "vq_loss_layer_017": 0.000372, "vq_loss_layer_018": 0.000229, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000538, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000359, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.000381, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.001427, "vq_loss_layer_029": 0.001366, "vq_loss_layer_030": 0.002625, "vq_loss_layer_031": 0.005737 }, { "ce_loss": 2.29058, "epoch": 0.00405, "grad_norm": 0.0065770684741437435, "key_mse_loss_layer_000": 0.002792, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.043945, "key_mse_loss_layer_004": 0.041504, "key_mse_loss_layer_005": 0.05542, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.071289, "key_mse_loss_layer_008": 0.078613, "key_mse_loss_layer_009": 0.083008, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.092773, "key_mse_loss_layer_012": 0.068359, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.055997, "kv_vq_loss": 0.000642, "learning_rate": 0.000901863755803667, "loss": 0.056631, "step": 4050, "value_mse_loss_layer_000": 0.000786, "value_mse_loss_layer_001": 0.002136, "value_mse_loss_layer_002": 0.00885, "value_mse_loss_layer_003": 0.013306, "value_mse_loss_layer_004": 0.013123, "value_mse_loss_layer_005": 0.012451, "value_mse_loss_layer_006": 0.014648, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.018677, "value_mse_loss_layer_009": 0.02417, "value_mse_loss_layer_010": 0.020386, "value_mse_loss_layer_011": 0.02124, "value_mse_loss_layer_012": 0.021851, "value_mse_loss_layer_013": 0.023193, "value_mse_loss_layer_014": 0.024536, "value_mse_loss_layer_015": 0.027466, "value_mse_loss_layer_016": 0.02356, "value_mse_loss_layer_017": 0.027344, "value_mse_loss_layer_018": 0.029053, "value_mse_loss_layer_019": 0.028809, "value_mse_loss_layer_020": 0.030762, "value_mse_loss_layer_021": 0.036377, "value_mse_loss_layer_022": 0.035156, "value_mse_loss_layer_023": 0.040039, "value_mse_loss_layer_024": 0.053711, "value_mse_loss_layer_025": 0.0625, "value_mse_loss_layer_026": 0.05249, "value_mse_loss_layer_027": 0.062012, "value_mse_loss_layer_028": 0.064453, "value_mse_loss_layer_029": 0.118164, "value_mse_loss_layer_030": 0.083496, "value_mse_loss_layer_031": 0.092285, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.000137, "vq_loss_layer_007": 0.000196, "vq_loss_layer_008": 0.000218, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000265, "vq_loss_layer_011": 0.000259, "vq_loss_layer_012": 0.000414, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000437, "vq_loss_layer_015": 0.000546, "vq_loss_layer_016": 0.000475, "vq_loss_layer_017": 0.000429, "vq_loss_layer_018": 0.000338, "vq_loss_layer_019": 0.00025, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000469, "vq_loss_layer_022": 0.00028, "vq_loss_layer_023": 0.000313, "vq_loss_layer_024": 0.000511, "vq_loss_layer_025": 0.000584, "vq_loss_layer_026": 0.000851, "vq_loss_layer_027": 0.000736, "vq_loss_layer_028": 0.001099, "vq_loss_layer_029": 0.002045, "vq_loss_layer_030": 0.003036, "vq_loss_layer_031": 0.006042 }, { "ce_loss": 2.240989, "epoch": 0.00406, "grad_norm": 0.003329987870529294, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.010925, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.083008, "kv_mse_loss": 0.056653, "kv_vq_loss": 0.000646, "learning_rate": 0.0009021315083942985, "loss": 0.057266, "step": 4060, "value_mse_loss_layer_000": 0.000736, "value_mse_loss_layer_001": 0.002167, "value_mse_loss_layer_002": 0.00946, "value_mse_loss_layer_003": 0.014343, "value_mse_loss_layer_004": 0.012756, "value_mse_loss_layer_005": 0.013367, "value_mse_loss_layer_006": 0.014648, "value_mse_loss_layer_007": 0.016602, "value_mse_loss_layer_008": 0.019409, "value_mse_loss_layer_009": 0.024414, "value_mse_loss_layer_010": 0.021484, "value_mse_loss_layer_011": 0.022461, "value_mse_loss_layer_012": 0.023438, "value_mse_loss_layer_013": 0.024292, "value_mse_loss_layer_014": 0.026855, "value_mse_loss_layer_015": 0.027954, "value_mse_loss_layer_016": 0.024902, "value_mse_loss_layer_017": 0.028931, "value_mse_loss_layer_018": 0.027954, "value_mse_loss_layer_019": 0.030884, "value_mse_loss_layer_020": 0.032959, "value_mse_loss_layer_021": 0.041504, "value_mse_loss_layer_022": 0.037842, "value_mse_loss_layer_023": 0.042236, "value_mse_loss_layer_024": 0.049805, "value_mse_loss_layer_025": 0.054932, "value_mse_loss_layer_026": 0.047363, "value_mse_loss_layer_027": 0.064453, "value_mse_loss_layer_028": 0.063965, "value_mse_loss_layer_029": 0.090332, "value_mse_loss_layer_030": 0.085449, "value_mse_loss_layer_031": 0.096191, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 0.000109, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000254, "vq_loss_layer_008": 0.000238, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000246, "vq_loss_layer_011": 0.000322, "vq_loss_layer_012": 0.000492, "vq_loss_layer_013": 0.000406, "vq_loss_layer_014": 0.000565, "vq_loss_layer_015": 0.000538, "vq_loss_layer_016": 0.000534, "vq_loss_layer_017": 0.000477, "vq_loss_layer_018": 0.000294, "vq_loss_layer_019": 0.000246, "vq_loss_layer_020": 0.000267, "vq_loss_layer_021": 0.000515, "vq_loss_layer_022": 0.000343, "vq_loss_layer_023": 0.000402, "vq_loss_layer_024": 0.000433, "vq_loss_layer_025": 0.00053, "vq_loss_layer_026": 0.000767, "vq_loss_layer_027": 0.000984, "vq_loss_layer_028": 0.00145, "vq_loss_layer_029": 0.003159, "vq_loss_layer_030": 0.003815, "vq_loss_layer_031": 0.009583 }, { "ce_loss": 2.279835, "epoch": 0.00407, "grad_norm": 0.0026294903364032507, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.0561, "kv_vq_loss": 0.000644, "learning_rate": 0.000902398602306305, "loss": 0.05672, "step": 4070, "value_mse_loss_layer_000": 0.000725, "value_mse_loss_layer_001": 0.002167, "value_mse_loss_layer_002": 0.008423, "value_mse_loss_layer_003": 0.013367, "value_mse_loss_layer_004": 0.01239, "value_mse_loss_layer_005": 0.012939, "value_mse_loss_layer_006": 0.014954, "value_mse_loss_layer_007": 0.015869, "value_mse_loss_layer_008": 0.019165, "value_mse_loss_layer_009": 0.024414, "value_mse_loss_layer_010": 0.021606, "value_mse_loss_layer_011": 0.022217, "value_mse_loss_layer_012": 0.023071, "value_mse_loss_layer_013": 0.024658, "value_mse_loss_layer_014": 0.025757, "value_mse_loss_layer_015": 0.029907, "value_mse_loss_layer_016": 0.024292, "value_mse_loss_layer_017": 0.02771, "value_mse_loss_layer_018": 0.024902, "value_mse_loss_layer_019": 0.029663, "value_mse_loss_layer_020": 0.03064, "value_mse_loss_layer_021": 0.039062, "value_mse_loss_layer_022": 0.0354, "value_mse_loss_layer_023": 0.041992, "value_mse_loss_layer_024": 0.044922, "value_mse_loss_layer_025": 0.051514, "value_mse_loss_layer_026": 0.046387, "value_mse_loss_layer_027": 0.057861, "value_mse_loss_layer_028": 0.064453, "value_mse_loss_layer_029": 0.088379, "value_mse_loss_layer_030": 0.080566, "value_mse_loss_layer_031": 0.09082, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7.7e-05, "vq_loss_layer_005": 0.000108, "vq_loss_layer_006": 0.000187, "vq_loss_layer_007": 0.000235, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000288, "vq_loss_layer_010": 0.000284, "vq_loss_layer_011": 0.000305, "vq_loss_layer_012": 0.000486, "vq_loss_layer_013": 0.000406, "vq_loss_layer_014": 0.000496, "vq_loss_layer_015": 0.000694, "vq_loss_layer_016": 0.000511, "vq_loss_layer_017": 0.000465, "vq_loss_layer_018": 0.000263, "vq_loss_layer_019": 0.000238, "vq_loss_layer_020": 0.000288, "vq_loss_layer_021": 0.00061, "vq_loss_layer_022": 0.000315, "vq_loss_layer_023": 0.000452, "vq_loss_layer_024": 0.000395, "vq_loss_layer_025": 0.000456, "vq_loss_layer_026": 0.000652, "vq_loss_layer_027": 0.00069, "vq_loss_layer_028": 0.001068, "vq_loss_layer_029": 0.001457, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.005798 }, { "ce_loss": 2.252128, "epoch": 0.00408, "grad_norm": 0.003281095763668418, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.056232, "kv_vq_loss": 0.000642, "learning_rate": 0.0009026650407724699, "loss": 0.056854, "step": 4080, "value_mse_loss_layer_000": 0.00074, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.008728, "value_mse_loss_layer_003": 0.014221, "value_mse_loss_layer_004": 0.013855, "value_mse_loss_layer_005": 0.013, "value_mse_loss_layer_006": 0.014709, "value_mse_loss_layer_007": 0.016235, "value_mse_loss_layer_008": 0.019043, "value_mse_loss_layer_009": 0.025513, "value_mse_loss_layer_010": 0.02063, "value_mse_loss_layer_011": 0.021851, "value_mse_loss_layer_012": 0.023193, "value_mse_loss_layer_013": 0.025513, "value_mse_loss_layer_014": 0.024902, "value_mse_loss_layer_015": 0.026123, "value_mse_loss_layer_016": 0.022583, "value_mse_loss_layer_017": 0.0271, "value_mse_loss_layer_018": 0.024658, "value_mse_loss_layer_019": 0.02771, "value_mse_loss_layer_020": 0.032471, "value_mse_loss_layer_021": 0.034668, "value_mse_loss_layer_022": 0.034424, "value_mse_loss_layer_023": 0.038818, "value_mse_loss_layer_024": 0.041992, "value_mse_loss_layer_025": 0.050049, "value_mse_loss_layer_026": 0.04248, "value_mse_loss_layer_027": 0.060791, "value_mse_loss_layer_028": 0.058105, "value_mse_loss_layer_029": 0.082031, "value_mse_loss_layer_030": 0.08252, "value_mse_loss_layer_031": 0.089355, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 0.000106, "vq_loss_layer_005": 0.000102, "vq_loss_layer_006": 0.000166, "vq_loss_layer_007": 0.000239, "vq_loss_layer_008": 0.000271, "vq_loss_layer_009": 0.000368, "vq_loss_layer_010": 0.000286, "vq_loss_layer_011": 0.00032, "vq_loss_layer_012": 0.000542, "vq_loss_layer_013": 0.000523, "vq_loss_layer_014": 0.000526, "vq_loss_layer_015": 0.000526, "vq_loss_layer_016": 0.000519, "vq_loss_layer_017": 0.000496, "vq_loss_layer_018": 0.000241, "vq_loss_layer_019": 0.000217, "vq_loss_layer_020": 0.000271, "vq_loss_layer_021": 0.000561, "vq_loss_layer_022": 0.000349, "vq_loss_layer_023": 0.000404, "vq_loss_layer_024": 0.000343, "vq_loss_layer_025": 0.000546, "vq_loss_layer_026": 0.000687, "vq_loss_layer_027": 0.000854, "vq_loss_layer_028": 0.001297, "vq_loss_layer_029": 0.001686, "vq_loss_layer_030": 0.00264, "vq_loss_layer_031": 0.006592 }, { "ce_loss": 2.32719, "epoch": 0.00409, "grad_norm": 0.003900784533470869, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.056149, "kv_vq_loss": 0.000644, "learning_rate": 0.0009029308270018354, "loss": 0.056784, "step": 4090, "value_mse_loss_layer_000": 0.000736, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.012085, "value_mse_loss_layer_003": 0.013611, "value_mse_loss_layer_004": 0.012329, "value_mse_loss_layer_005": 0.012146, "value_mse_loss_layer_006": 0.015442, "value_mse_loss_layer_007": 0.015625, "value_mse_loss_layer_008": 0.018799, "value_mse_loss_layer_009": 0.02417, "value_mse_loss_layer_010": 0.020752, "value_mse_loss_layer_011": 0.022339, "value_mse_loss_layer_012": 0.023193, "value_mse_loss_layer_013": 0.024292, "value_mse_loss_layer_014": 0.025391, "value_mse_loss_layer_015": 0.028076, "value_mse_loss_layer_016": 0.026245, "value_mse_loss_layer_017": 0.028076, "value_mse_loss_layer_018": 0.025269, "value_mse_loss_layer_019": 0.029297, "value_mse_loss_layer_020": 0.03125, "value_mse_loss_layer_021": 0.043945, "value_mse_loss_layer_022": 0.035889, "value_mse_loss_layer_023": 0.040771, "value_mse_loss_layer_024": 0.042725, "value_mse_loss_layer_025": 0.056396, "value_mse_loss_layer_026": 0.047607, "value_mse_loss_layer_027": 0.062256, "value_mse_loss_layer_028": 0.063965, "value_mse_loss_layer_029": 0.086426, "value_mse_loss_layer_030": 0.083008, "value_mse_loss_layer_031": 0.094238, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 8.8e-05, "vq_loss_layer_006": 0.000202, "vq_loss_layer_007": 0.000227, "vq_loss_layer_008": 0.000237, "vq_loss_layer_009": 0.000299, "vq_loss_layer_010": 0.000265, "vq_loss_layer_011": 0.000315, "vq_loss_layer_012": 0.000475, "vq_loss_layer_013": 0.000399, "vq_loss_layer_014": 0.000469, "vq_loss_layer_015": 0.0005, "vq_loss_layer_016": 0.000511, "vq_loss_layer_017": 0.000435, "vq_loss_layer_018": 0.000261, "vq_loss_layer_019": 0.000206, "vq_loss_layer_020": 0.000269, "vq_loss_layer_021": 0.000607, "vq_loss_layer_022": 0.000317, "vq_loss_layer_023": 0.000378, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000414, "vq_loss_layer_026": 0.00071, "vq_loss_layer_027": 0.000751, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001381, "vq_loss_layer_030": 0.002563, "vq_loss_layer_031": 0.005737 }, { "ce_loss": 2.349542, "epoch": 0.0041, "grad_norm": 0.004176249727606773, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.05556, "kv_vq_loss": 0.000614, "learning_rate": 0.0009031959641799338, "loss": 0.056146, "step": 4100, "value_mse_loss_layer_000": 0.00074, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.008667, "value_mse_loss_layer_003": 0.013245, "value_mse_loss_layer_004": 0.012878, "value_mse_loss_layer_005": 0.013306, "value_mse_loss_layer_006": 0.014404, "value_mse_loss_layer_007": 0.015869, "value_mse_loss_layer_008": 0.020386, "value_mse_loss_layer_009": 0.02417, "value_mse_loss_layer_010": 0.021851, "value_mse_loss_layer_011": 0.022217, "value_mse_loss_layer_012": 0.022461, "value_mse_loss_layer_013": 0.023926, "value_mse_loss_layer_014": 0.026001, "value_mse_loss_layer_015": 0.028198, "value_mse_loss_layer_016": 0.023926, "value_mse_loss_layer_017": 0.028198, "value_mse_loss_layer_018": 0.026367, "value_mse_loss_layer_019": 0.030029, "value_mse_loss_layer_020": 0.032471, "value_mse_loss_layer_021": 0.040283, "value_mse_loss_layer_022": 0.037598, "value_mse_loss_layer_023": 0.043213, "value_mse_loss_layer_024": 0.052002, "value_mse_loss_layer_025": 0.062256, "value_mse_loss_layer_026": 0.049561, "value_mse_loss_layer_027": 0.058838, "value_mse_loss_layer_028": 0.063477, "value_mse_loss_layer_029": 0.088867, "value_mse_loss_layer_030": 0.080078, "value_mse_loss_layer_031": 0.092773, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 0.000108, "vq_loss_layer_005": 0.000113, "vq_loss_layer_006": 0.000152, "vq_loss_layer_007": 0.00023, "vq_loss_layer_008": 0.000282, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.000298, "vq_loss_layer_012": 0.000448, "vq_loss_layer_013": 0.000378, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000504, "vq_loss_layer_016": 0.000483, "vq_loss_layer_017": 0.00046, "vq_loss_layer_018": 0.000294, "vq_loss_layer_019": 0.000265, "vq_loss_layer_020": 0.000275, "vq_loss_layer_021": 0.000511, "vq_loss_layer_022": 0.00034, "vq_loss_layer_023": 0.000465, "vq_loss_layer_024": 0.000412, "vq_loss_layer_025": 0.00046, "vq_loss_layer_026": 0.000648, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.000938, "vq_loss_layer_029": 0.001442, "vq_loss_layer_030": 0.003036, "vq_loss_layer_031": 0.005341 }, { "ce_loss": 2.266434, "epoch": 0.00411, "grad_norm": 0.0043711536563932896, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.044922, "key_mse_loss_layer_004": 0.04126, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.061768, "key_mse_loss_layer_007": 0.071289, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.102051, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.058594, "kv_mse_loss": 0.056207, "kv_vq_loss": 0.00064, "learning_rate": 0.0009034604554690171, "loss": 0.056836, "step": 4110, "value_mse_loss_layer_000": 0.000702, "value_mse_loss_layer_001": 0.002228, "value_mse_loss_layer_002": 0.008728, "value_mse_loss_layer_003": 0.013184, "value_mse_loss_layer_004": 0.012817, "value_mse_loss_layer_005": 0.012451, "value_mse_loss_layer_006": 0.014526, "value_mse_loss_layer_007": 0.016113, "value_mse_loss_layer_008": 0.019043, "value_mse_loss_layer_009": 0.025757, "value_mse_loss_layer_010": 0.021118, "value_mse_loss_layer_011": 0.022217, "value_mse_loss_layer_012": 0.025513, "value_mse_loss_layer_013": 0.025024, "value_mse_loss_layer_014": 0.025635, "value_mse_loss_layer_015": 0.026855, "value_mse_loss_layer_016": 0.021729, "value_mse_loss_layer_017": 0.028198, "value_mse_loss_layer_018": 0.024292, "value_mse_loss_layer_019": 0.027222, "value_mse_loss_layer_020": 0.029663, "value_mse_loss_layer_021": 0.036133, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.034424, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.050781, "value_mse_loss_layer_026": 0.042969, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.057617, "value_mse_loss_layer_029": 0.090332, "value_mse_loss_layer_030": 0.076172, "value_mse_loss_layer_031": 0.097168, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2.6e-05, "vq_loss_layer_002": 2.7e-05, "vq_loss_layer_003": 4.3e-05, "vq_loss_layer_004": 9.2e-05, "vq_loss_layer_005": 0.000104, "vq_loss_layer_006": 0.000179, "vq_loss_layer_007": 0.000241, "vq_loss_layer_008": 0.00029, "vq_loss_layer_009": 0.000383, "vq_loss_layer_010": 0.000359, "vq_loss_layer_011": 0.000336, "vq_loss_layer_012": 0.00074, "vq_loss_layer_013": 0.000437, "vq_loss_layer_014": 0.000614, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.0005, "vq_loss_layer_017": 0.000584, "vq_loss_layer_018": 0.000269, "vq_loss_layer_019": 0.0002, "vq_loss_layer_020": 0.000288, "vq_loss_layer_021": 0.000702, "vq_loss_layer_022": 0.000313, "vq_loss_layer_023": 0.000351, "vq_loss_layer_024": 0.000446, "vq_loss_layer_025": 0.00058, "vq_loss_layer_026": 0.000729, "vq_loss_layer_027": 0.000664, "vq_loss_layer_028": 0.001236, "vq_loss_layer_029": 0.002457, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.008606 }, { "ce_loss": 2.300453, "epoch": 0.00412, "grad_norm": 0.00436311075463891, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.056329, "kv_vq_loss": 0.000651, "learning_rate": 0.0009037243040082836, "loss": 0.056955, "step": 4120, "value_mse_loss_layer_000": 0.000721, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.008118, "value_mse_loss_layer_003": 0.012939, "value_mse_loss_layer_004": 0.011658, "value_mse_loss_layer_005": 0.011841, "value_mse_loss_layer_006": 0.014832, "value_mse_loss_layer_007": 0.016479, "value_mse_loss_layer_008": 0.018433, "value_mse_loss_layer_009": 0.024048, "value_mse_loss_layer_010": 0.024536, "value_mse_loss_layer_011": 0.022461, "value_mse_loss_layer_012": 0.022705, "value_mse_loss_layer_013": 0.027222, "value_mse_loss_layer_014": 0.025635, "value_mse_loss_layer_015": 0.028198, "value_mse_loss_layer_016": 0.022949, "value_mse_loss_layer_017": 0.028564, "value_mse_loss_layer_018": 0.026611, "value_mse_loss_layer_019": 0.032227, "value_mse_loss_layer_020": 0.032715, "value_mse_loss_layer_021": 0.03833, "value_mse_loss_layer_022": 0.034424, "value_mse_loss_layer_023": 0.043457, "value_mse_loss_layer_024": 0.043701, "value_mse_loss_layer_025": 0.054199, "value_mse_loss_layer_026": 0.043701, "value_mse_loss_layer_027": 0.071289, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.085938, "value_mse_loss_layer_030": 0.079102, "value_mse_loss_layer_031": 0.089355, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 8.8e-05, "vq_loss_layer_006": 0.000173, "vq_loss_layer_007": 0.000257, "vq_loss_layer_008": 0.000224, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.000328, "vq_loss_layer_011": 0.000313, "vq_loss_layer_012": 0.000473, "vq_loss_layer_013": 0.000511, "vq_loss_layer_014": 0.000496, "vq_loss_layer_015": 0.0005, "vq_loss_layer_016": 0.000446, "vq_loss_layer_017": 0.000456, "vq_loss_layer_018": 0.000294, "vq_loss_layer_019": 0.00023, "vq_loss_layer_020": 0.000271, "vq_loss_layer_021": 0.00053, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000458, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000391, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000862, "vq_loss_layer_028": 0.001534, "vq_loss_layer_029": 0.001213, "vq_loss_layer_030": 0.002502, "vq_loss_layer_031": 0.00531 }, { "ce_loss": 2.3092, "epoch": 0.00413, "grad_norm": 0.0034609446302056313, "key_mse_loss_layer_000": 0.003662, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.061035, "kv_mse_loss": 0.056299, "kv_vq_loss": 0.00065, "learning_rate": 0.0009039875129141002, "loss": 0.056955, "step": 4130, "value_mse_loss_layer_000": 0.000706, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.008362, "value_mse_loss_layer_003": 0.013916, "value_mse_loss_layer_004": 0.013, "value_mse_loss_layer_005": 0.012512, "value_mse_loss_layer_006": 0.01416, "value_mse_loss_layer_007": 0.015747, "value_mse_loss_layer_008": 0.019531, "value_mse_loss_layer_009": 0.024658, "value_mse_loss_layer_010": 0.020752, "value_mse_loss_layer_011": 0.022095, "value_mse_loss_layer_012": 0.022705, "value_mse_loss_layer_013": 0.024292, "value_mse_loss_layer_014": 0.025635, "value_mse_loss_layer_015": 0.026123, "value_mse_loss_layer_016": 0.022217, "value_mse_loss_layer_017": 0.026367, "value_mse_loss_layer_018": 0.02417, "value_mse_loss_layer_019": 0.029175, "value_mse_loss_layer_020": 0.029297, "value_mse_loss_layer_021": 0.035889, "value_mse_loss_layer_022": 0.033691, "value_mse_loss_layer_023": 0.039062, "value_mse_loss_layer_024": 0.049316, "value_mse_loss_layer_025": 0.049805, "value_mse_loss_layer_026": 0.050049, "value_mse_loss_layer_027": 0.067383, "value_mse_loss_layer_028": 0.066895, "value_mse_loss_layer_029": 0.089355, "value_mse_loss_layer_030": 0.085449, "value_mse_loss_layer_031": 0.097168, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2.1e-05, "vq_loss_layer_002": 2.2e-05, "vq_loss_layer_003": 4.9e-05, "vq_loss_layer_004": 9.1e-05, "vq_loss_layer_005": 0.000102, "vq_loss_layer_006": 0.000155, "vq_loss_layer_007": 0.000219, "vq_loss_layer_008": 0.000288, "vq_loss_layer_009": 0.000311, "vq_loss_layer_010": 0.000298, "vq_loss_layer_011": 0.00034, "vq_loss_layer_012": 0.000492, "vq_loss_layer_013": 0.000431, "vq_loss_layer_014": 0.000584, "vq_loss_layer_015": 0.000462, "vq_loss_layer_016": 0.000534, "vq_loss_layer_017": 0.000425, "vq_loss_layer_018": 0.000212, "vq_loss_layer_019": 0.000263, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000572, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000315, "vq_loss_layer_024": 0.000479, "vq_loss_layer_025": 0.000422, "vq_loss_layer_026": 0.000694, "vq_loss_layer_027": 0.000767, "vq_loss_layer_028": 0.001534, "vq_loss_layer_029": 0.002029, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.007568 }, { "ce_loss": 2.282061, "epoch": 0.00414, "grad_norm": 0.004420093726366758, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.055923, "kv_vq_loss": 0.000628, "learning_rate": 0.0009042500852802246, "loss": 0.056528, "step": 4140, "value_mse_loss_layer_000": 0.000736, "value_mse_loss_layer_001": 0.002136, "value_mse_loss_layer_002": 0.008972, "value_mse_loss_layer_003": 0.012634, "value_mse_loss_layer_004": 0.011475, "value_mse_loss_layer_005": 0.012207, "value_mse_loss_layer_006": 0.014465, "value_mse_loss_layer_007": 0.015564, "value_mse_loss_layer_008": 0.019287, "value_mse_loss_layer_009": 0.025146, "value_mse_loss_layer_010": 0.02124, "value_mse_loss_layer_011": 0.022339, "value_mse_loss_layer_012": 0.022583, "value_mse_loss_layer_013": 0.024414, "value_mse_loss_layer_014": 0.025757, "value_mse_loss_layer_015": 0.028442, "value_mse_loss_layer_016": 0.023804, "value_mse_loss_layer_017": 0.027954, "value_mse_loss_layer_018": 0.0271, "value_mse_loss_layer_019": 0.029175, "value_mse_loss_layer_020": 0.040283, "value_mse_loss_layer_021": 0.036133, "value_mse_loss_layer_022": 0.035889, "value_mse_loss_layer_023": 0.039795, "value_mse_loss_layer_024": 0.051514, "value_mse_loss_layer_025": 0.055176, "value_mse_loss_layer_026": 0.044922, "value_mse_loss_layer_027": 0.05957, "value_mse_loss_layer_028": 0.060303, "value_mse_loss_layer_029": 0.084961, "value_mse_loss_layer_030": 0.075195, "value_mse_loss_layer_031": 0.094238, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 0.000102, "vq_loss_layer_006": 0.000159, "vq_loss_layer_007": 0.00023, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000328, "vq_loss_layer_010": 0.000252, "vq_loss_layer_011": 0.000298, "vq_loss_layer_012": 0.000475, "vq_loss_layer_013": 0.000437, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000511, "vq_loss_layer_016": 0.000471, "vq_loss_layer_017": 0.000427, "vq_loss_layer_018": 0.000288, "vq_loss_layer_019": 0.000222, "vq_loss_layer_020": 0.000372, "vq_loss_layer_021": 0.000465, "vq_loss_layer_022": 0.000315, "vq_loss_layer_023": 0.000376, "vq_loss_layer_024": 0.000397, "vq_loss_layer_025": 0.000427, "vq_loss_layer_026": 0.00058, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000828, "vq_loss_layer_029": 0.001251, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.005768 }, { "ce_loss": 2.319931, "epoch": 0.00415, "grad_norm": 0.0053113168105483055, "key_mse_loss_layer_000": 0.003891, "key_mse_loss_layer_001": 0.011719, "key_mse_loss_layer_002": 0.060547, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.065918, "key_mse_loss_layer_006": 0.07373, "key_mse_loss_layer_007": 0.082031, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.105957, "key_mse_loss_layer_012": 0.081055, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.124512, "key_mse_loss_layer_015": 0.11377, "key_mse_loss_layer_016": 0.105469, "key_mse_loss_layer_017": 0.109375, "key_mse_loss_layer_018": 0.114746, "key_mse_loss_layer_019": 0.098145, "key_mse_loss_layer_020": 0.108887, "key_mse_loss_layer_021": 0.104004, "key_mse_loss_layer_022": 0.105469, "key_mse_loss_layer_023": 0.101074, "key_mse_loss_layer_024": 0.083496, "key_mse_loss_layer_025": 0.080566, "key_mse_loss_layer_026": 0.093262, "key_mse_loss_layer_027": 0.092773, "key_mse_loss_layer_028": 0.099609, "key_mse_loss_layer_029": 0.091797, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.055994, "kv_vq_loss": 0.000625, "learning_rate": 0.0009045120241780232, "loss": 0.056595, "step": 4150, "value_mse_loss_layer_000": 0.000744, "value_mse_loss_layer_001": 0.002167, "value_mse_loss_layer_002": 0.009277, "value_mse_loss_layer_003": 0.017456, "value_mse_loss_layer_004": 0.015137, "value_mse_loss_layer_005": 0.013306, "value_mse_loss_layer_006": 0.015503, "value_mse_loss_layer_007": 0.016846, "value_mse_loss_layer_008": 0.02002, "value_mse_loss_layer_009": 0.025391, "value_mse_loss_layer_010": 0.022583, "value_mse_loss_layer_011": 0.023315, "value_mse_loss_layer_012": 0.025879, "value_mse_loss_layer_013": 0.026001, "value_mse_loss_layer_014": 0.02832, "value_mse_loss_layer_015": 0.029053, "value_mse_loss_layer_016": 0.02478, "value_mse_loss_layer_017": 0.029907, "value_mse_loss_layer_018": 0.026245, "value_mse_loss_layer_019": 0.030273, "value_mse_loss_layer_020": 0.033203, "value_mse_loss_layer_021": 0.037109, "value_mse_loss_layer_022": 0.036865, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.044922, "value_mse_loss_layer_025": 0.053467, "value_mse_loss_layer_026": 0.047119, "value_mse_loss_layer_027": 0.063477, "value_mse_loss_layer_028": 0.061768, "value_mse_loss_layer_029": 0.112793, "value_mse_loss_layer_030": 0.09082, "value_mse_loss_layer_031": 0.100098, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 5.3e-05, "vq_loss_layer_004": 0.000181, "vq_loss_layer_005": 0.00014, "vq_loss_layer_006": 0.000231, "vq_loss_layer_007": 0.000277, "vq_loss_layer_008": 0.000294, "vq_loss_layer_009": 0.000351, "vq_loss_layer_010": 0.000328, "vq_loss_layer_011": 0.000357, "vq_loss_layer_012": 0.000633, "vq_loss_layer_013": 0.000454, "vq_loss_layer_014": 0.000633, "vq_loss_layer_015": 0.00061, "vq_loss_layer_016": 0.000561, "vq_loss_layer_017": 0.00058, "vq_loss_layer_018": 0.000309, "vq_loss_layer_019": 0.000299, "vq_loss_layer_020": 0.000395, "vq_loss_layer_021": 0.000546, "vq_loss_layer_022": 0.000393, "vq_loss_layer_023": 0.00038, "vq_loss_layer_024": 0.000452, "vq_loss_layer_025": 0.000599, "vq_loss_layer_026": 0.000786, "vq_loss_layer_027": 0.000999, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.002502, "vq_loss_layer_030": 0.003937, "vq_loss_layer_031": 0.007202 }, { "ce_loss": 2.308833, "epoch": 0.00416, "grad_norm": 0.0036693259608000517, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.076172, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.056036, "kv_vq_loss": 0.000646, "learning_rate": 0.0009047733326566855, "loss": 0.056668, "step": 4160, "value_mse_loss_layer_000": 0.000729, "value_mse_loss_layer_001": 0.002335, "value_mse_loss_layer_002": 0.008667, "value_mse_loss_layer_003": 0.013977, "value_mse_loss_layer_004": 0.013489, "value_mse_loss_layer_005": 0.012451, "value_mse_loss_layer_006": 0.014343, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.019287, "value_mse_loss_layer_009": 0.02478, "value_mse_loss_layer_010": 0.020996, "value_mse_loss_layer_011": 0.022339, "value_mse_loss_layer_012": 0.022949, "value_mse_loss_layer_013": 0.027222, "value_mse_loss_layer_014": 0.026123, "value_mse_loss_layer_015": 0.028076, "value_mse_loss_layer_016": 0.02356, "value_mse_loss_layer_017": 0.026367, "value_mse_loss_layer_018": 0.025513, "value_mse_loss_layer_019": 0.030029, "value_mse_loss_layer_020": 0.030762, "value_mse_loss_layer_021": 0.037109, "value_mse_loss_layer_022": 0.038086, "value_mse_loss_layer_023": 0.045166, "value_mse_loss_layer_024": 0.043701, "value_mse_loss_layer_025": 0.054932, "value_mse_loss_layer_026": 0.047119, "value_mse_loss_layer_027": 0.06543, "value_mse_loss_layer_028": 0.066406, "value_mse_loss_layer_029": 0.087891, "value_mse_loss_layer_030": 0.083984, "value_mse_loss_layer_031": 0.095215, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2.7e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 3.8e-05, "vq_loss_layer_004": 9.6e-05, "vq_loss_layer_005": 9.4e-05, "vq_loss_layer_006": 0.000152, "vq_loss_layer_007": 0.000221, "vq_loss_layer_008": 0.000259, "vq_loss_layer_009": 0.000328, "vq_loss_layer_010": 0.00029, "vq_loss_layer_011": 0.000341, "vq_loss_layer_012": 0.000492, "vq_loss_layer_013": 0.000568, "vq_loss_layer_014": 0.000526, "vq_loss_layer_015": 0.000584, "vq_loss_layer_016": 0.000534, "vq_loss_layer_017": 0.000414, "vq_loss_layer_018": 0.000294, "vq_loss_layer_019": 0.000265, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000557, "vq_loss_layer_022": 0.000414, "vq_loss_layer_023": 0.000462, "vq_loss_layer_024": 0.000389, "vq_loss_layer_025": 0.000526, "vq_loss_layer_026": 0.000698, "vq_loss_layer_027": 0.000839, "vq_loss_layer_028": 0.00132, "vq_loss_layer_029": 0.001747, "vq_loss_layer_030": 0.003372, "vq_loss_layer_031": 0.007233 }, { "ce_loss": 2.295851, "epoch": 0.00417, "grad_norm": 0.004009204916656017, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.055975, "kv_vq_loss": 0.00064, "learning_rate": 0.0009050340137434393, "loss": 0.056595, "step": 4170, "value_mse_loss_layer_000": 0.000732, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.008118, "value_mse_loss_layer_003": 0.012634, "value_mse_loss_layer_004": 0.011597, "value_mse_loss_layer_005": 0.012085, "value_mse_loss_layer_006": 0.014038, "value_mse_loss_layer_007": 0.015503, "value_mse_loss_layer_008": 0.018311, "value_mse_loss_layer_009": 0.024048, "value_mse_loss_layer_010": 0.02063, "value_mse_loss_layer_011": 0.021606, "value_mse_loss_layer_012": 0.022339, "value_mse_loss_layer_013": 0.024292, "value_mse_loss_layer_014": 0.026245, "value_mse_loss_layer_015": 0.029053, "value_mse_loss_layer_016": 0.025024, "value_mse_loss_layer_017": 0.027832, "value_mse_loss_layer_018": 0.024902, "value_mse_loss_layer_019": 0.029419, "value_mse_loss_layer_020": 0.030884, "value_mse_loss_layer_021": 0.037354, "value_mse_loss_layer_022": 0.035889, "value_mse_loss_layer_023": 0.039795, "value_mse_loss_layer_024": 0.041748, "value_mse_loss_layer_025": 0.049316, "value_mse_loss_layer_026": 0.047363, "value_mse_loss_layer_027": 0.059082, "value_mse_loss_layer_028": 0.059814, "value_mse_loss_layer_029": 0.09082, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.088867, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000147, "vq_loss_layer_007": 0.000237, "vq_loss_layer_008": 0.0002, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000269, "vq_loss_layer_012": 0.000469, "vq_loss_layer_013": 0.000387, "vq_loss_layer_014": 0.000515, "vq_loss_layer_015": 0.000587, "vq_loss_layer_016": 0.000523, "vq_loss_layer_017": 0.000422, "vq_loss_layer_018": 0.000241, "vq_loss_layer_019": 0.000208, "vq_loss_layer_020": 0.00023, "vq_loss_layer_021": 0.000492, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.000347, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.000683, "vq_loss_layer_027": 0.000694, "vq_loss_layer_028": 0.000801, "vq_loss_layer_029": 0.001251, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.005127 }, { "ce_loss": 2.288348, "epoch": 0.00418, "grad_norm": 0.003073751926422119, "key_mse_loss_layer_000": 0.00351, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.09082, "key_mse_loss_layer_009": 0.099609, "key_mse_loss_layer_010": 0.111328, "key_mse_loss_layer_011": 0.105469, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.134766, "key_mse_loss_layer_014": 0.131836, "key_mse_loss_layer_015": 0.119141, "key_mse_loss_layer_016": 0.115234, "key_mse_loss_layer_017": 0.113281, "key_mse_loss_layer_018": 0.12207, "key_mse_loss_layer_019": 0.099121, "key_mse_loss_layer_020": 0.114258, "key_mse_loss_layer_021": 0.10791, "key_mse_loss_layer_022": 0.113281, "key_mse_loss_layer_023": 0.108398, "key_mse_loss_layer_024": 0.087891, "key_mse_loss_layer_025": 0.082031, "key_mse_loss_layer_026": 0.097168, "key_mse_loss_layer_027": 0.096191, "key_mse_loss_layer_028": 0.102051, "key_mse_loss_layer_029": 0.091309, "key_mse_loss_layer_030": 0.106934, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.055865, "kv_vq_loss": 0.000619, "learning_rate": 0.0009052940704437585, "loss": 0.056476, "step": 4180, "value_mse_loss_layer_000": 0.000751, "value_mse_loss_layer_001": 0.002182, "value_mse_loss_layer_002": 0.009644, "value_mse_loss_layer_003": 0.013489, "value_mse_loss_layer_004": 0.013367, "value_mse_loss_layer_005": 0.012451, "value_mse_loss_layer_006": 0.014282, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.020508, "value_mse_loss_layer_009": 0.024292, "value_mse_loss_layer_010": 0.020874, "value_mse_loss_layer_011": 0.020996, "value_mse_loss_layer_012": 0.029785, "value_mse_loss_layer_013": 0.022949, "value_mse_loss_layer_014": 0.023804, "value_mse_loss_layer_015": 0.025146, "value_mse_loss_layer_016": 0.021484, "value_mse_loss_layer_017": 0.025146, "value_mse_loss_layer_018": 0.024292, "value_mse_loss_layer_019": 0.026245, "value_mse_loss_layer_020": 0.029907, "value_mse_loss_layer_021": 0.034424, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.033691, "value_mse_loss_layer_024": 0.041748, "value_mse_loss_layer_025": 0.046143, "value_mse_loss_layer_026": 0.039551, "value_mse_loss_layer_027": 0.053467, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.075195, "value_mse_loss_layer_030": 0.07959, "value_mse_loss_layer_031": 0.087402, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 8.4e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.000152, "vq_loss_layer_007": 0.000233, "vq_loss_layer_008": 0.000368, "vq_loss_layer_009": 0.000368, "vq_loss_layer_010": 0.000309, "vq_loss_layer_011": 0.000307, "vq_loss_layer_012": 0.00106, "vq_loss_layer_013": 0.000345, "vq_loss_layer_014": 0.000519, "vq_loss_layer_015": 0.000473, "vq_loss_layer_016": 0.000483, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000244, "vq_loss_layer_021": 0.000599, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000355, "vq_loss_layer_025": 0.000437, "vq_loss_layer_026": 0.000572, "vq_loss_layer_027": 0.000656, "vq_loss_layer_028": 0.000969, "vq_loss_layer_029": 0.001305, "vq_loss_layer_030": 0.003036, "vq_loss_layer_031": 0.006531 }, { "ce_loss": 2.297602, "epoch": 0.00419, "grad_norm": 0.007120147347450256, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.0561, "kv_vq_loss": 0.00063, "learning_rate": 0.0009055535057415736, "loss": 0.056714, "step": 4190, "value_mse_loss_layer_000": 0.000763, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.008484, "value_mse_loss_layer_003": 0.012878, "value_mse_loss_layer_004": 0.011841, "value_mse_loss_layer_005": 0.013123, "value_mse_loss_layer_006": 0.014282, "value_mse_loss_layer_007": 0.015442, "value_mse_loss_layer_008": 0.018799, "value_mse_loss_layer_009": 0.025146, "value_mse_loss_layer_010": 0.02124, "value_mse_loss_layer_011": 0.022827, "value_mse_loss_layer_012": 0.024658, "value_mse_loss_layer_013": 0.024658, "value_mse_loss_layer_014": 0.027222, "value_mse_loss_layer_015": 0.027832, "value_mse_loss_layer_016": 0.024414, "value_mse_loss_layer_017": 0.029297, "value_mse_loss_layer_018": 0.026245, "value_mse_loss_layer_019": 0.030029, "value_mse_loss_layer_020": 0.031494, "value_mse_loss_layer_021": 0.034668, "value_mse_loss_layer_022": 0.03418, "value_mse_loss_layer_023": 0.038086, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.051514, "value_mse_loss_layer_026": 0.042969, "value_mse_loss_layer_027": 0.05249, "value_mse_loss_layer_028": 0.059082, "value_mse_loss_layer_029": 0.121582, "value_mse_loss_layer_030": 0.080078, "value_mse_loss_layer_031": 0.09082, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 8.4e-05, "vq_loss_layer_005": 0.000144, "vq_loss_layer_006": 0.000153, "vq_loss_layer_007": 0.000215, "vq_loss_layer_008": 0.000199, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.00022, "vq_loss_layer_011": 0.000298, "vq_loss_layer_012": 0.000622, "vq_loss_layer_013": 0.000345, "vq_loss_layer_014": 0.000439, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.000425, "vq_loss_layer_017": 0.000427, "vq_loss_layer_018": 0.000257, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000483, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.00069, "vq_loss_layer_029": 0.001343, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.005371 }, { "ce_loss": 2.313392, "epoch": 0.0042, "grad_norm": 0.0038278158754110336, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.055869, "kv_vq_loss": 0.000632, "learning_rate": 0.000905812322599475, "loss": 0.056488, "step": 4200, "value_mse_loss_layer_000": 0.000713, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.008179, "value_mse_loss_layer_003": 0.013245, "value_mse_loss_layer_004": 0.011414, "value_mse_loss_layer_005": 0.012268, "value_mse_loss_layer_006": 0.014343, "value_mse_loss_layer_007": 0.016602, "value_mse_loss_layer_008": 0.018555, "value_mse_loss_layer_009": 0.025391, "value_mse_loss_layer_010": 0.020508, "value_mse_loss_layer_011": 0.021729, "value_mse_loss_layer_012": 0.022705, "value_mse_loss_layer_013": 0.025391, "value_mse_loss_layer_014": 0.025269, "value_mse_loss_layer_015": 0.027832, "value_mse_loss_layer_016": 0.023071, "value_mse_loss_layer_017": 0.027344, "value_mse_loss_layer_018": 0.024414, "value_mse_loss_layer_019": 0.028809, "value_mse_loss_layer_020": 0.031738, "value_mse_loss_layer_021": 0.037354, "value_mse_loss_layer_022": 0.034668, "value_mse_loss_layer_023": 0.04126, "value_mse_loss_layer_024": 0.042236, "value_mse_loss_layer_025": 0.053711, "value_mse_loss_layer_026": 0.043457, "value_mse_loss_layer_027": 0.056396, "value_mse_loss_layer_028": 0.061523, "value_mse_loss_layer_029": 0.100586, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.086426, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 0.000111, "vq_loss_layer_006": 0.000164, "vq_loss_layer_007": 0.000271, "vq_loss_layer_008": 0.000218, "vq_loss_layer_009": 0.00036, "vq_loss_layer_010": 0.000242, "vq_loss_layer_011": 0.00029, "vq_loss_layer_012": 0.000463, "vq_loss_layer_013": 0.000452, "vq_loss_layer_014": 0.000484, "vq_loss_layer_015": 0.0005, "vq_loss_layer_016": 0.00045, "vq_loss_layer_017": 0.00041, "vq_loss_layer_018": 0.000244, "vq_loss_layer_019": 0.000195, "vq_loss_layer_020": 0.000319, "vq_loss_layer_021": 0.000471, "vq_loss_layer_022": 0.000296, "vq_loss_layer_023": 0.000416, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.000425, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000908, "vq_loss_layer_029": 0.001518, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.00531 }, { "ce_loss": 2.321563, "epoch": 0.00421, "grad_norm": 0.0035994169302284718, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.055899, "kv_vq_loss": 0.00062, "learning_rate": 0.0009060705239589171, "loss": 0.056485, "step": 4210, "value_mse_loss_layer_000": 0.000729, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.008301, "value_mse_loss_layer_003": 0.01416, "value_mse_loss_layer_004": 0.012634, "value_mse_loss_layer_005": 0.012268, "value_mse_loss_layer_006": 0.014221, "value_mse_loss_layer_007": 0.015869, "value_mse_loss_layer_008": 0.019653, "value_mse_loss_layer_009": 0.024414, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.021729, "value_mse_loss_layer_012": 0.022461, "value_mse_loss_layer_013": 0.023926, "value_mse_loss_layer_014": 0.02478, "value_mse_loss_layer_015": 0.028198, "value_mse_loss_layer_016": 0.023804, "value_mse_loss_layer_017": 0.028198, "value_mse_loss_layer_018": 0.026245, "value_mse_loss_layer_019": 0.030029, "value_mse_loss_layer_020": 0.031738, "value_mse_loss_layer_021": 0.040283, "value_mse_loss_layer_022": 0.037354, "value_mse_loss_layer_023": 0.043945, "value_mse_loss_layer_024": 0.043701, "value_mse_loss_layer_025": 0.053467, "value_mse_loss_layer_026": 0.053467, "value_mse_loss_layer_027": 0.063477, "value_mse_loss_layer_028": 0.068848, "value_mse_loss_layer_029": 0.090332, "value_mse_loss_layer_030": 0.081055, "value_mse_loss_layer_031": 0.092773, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.00015, "vq_loss_layer_007": 0.000237, "vq_loss_layer_008": 0.00025, "vq_loss_layer_009": 0.000305, "vq_loss_layer_010": 0.000243, "vq_loss_layer_011": 0.000324, "vq_loss_layer_012": 0.000458, "vq_loss_layer_013": 0.000399, "vq_loss_layer_014": 0.000462, "vq_loss_layer_015": 0.000542, "vq_loss_layer_016": 0.000492, "vq_loss_layer_017": 0.00053, "vq_loss_layer_018": 0.000303, "vq_loss_layer_019": 0.00022, "vq_loss_layer_020": 0.000231, "vq_loss_layer_021": 0.000561, "vq_loss_layer_022": 0.000341, "vq_loss_layer_023": 0.000397, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000362, "vq_loss_layer_026": 0.000816, "vq_loss_layer_027": 0.000713, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.001526, "vq_loss_layer_030": 0.002289, "vq_loss_layer_031": 0.006012 }, { "ce_loss": 2.326022, "epoch": 0.00422, "grad_norm": 0.0036015387158840895, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.05957, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.055594, "kv_vq_loss": 0.000601, "learning_rate": 0.0009063281127404184, "loss": 0.056161, "step": 4220, "value_mse_loss_layer_000": 0.000736, "value_mse_loss_layer_001": 0.002197, "value_mse_loss_layer_002": 0.008362, "value_mse_loss_layer_003": 0.013855, "value_mse_loss_layer_004": 0.011719, "value_mse_loss_layer_005": 0.012207, "value_mse_loss_layer_006": 0.014099, "value_mse_loss_layer_007": 0.015503, "value_mse_loss_layer_008": 0.018921, "value_mse_loss_layer_009": 0.024048, "value_mse_loss_layer_010": 0.022217, "value_mse_loss_layer_011": 0.021606, "value_mse_loss_layer_012": 0.022217, "value_mse_loss_layer_013": 0.023438, "value_mse_loss_layer_014": 0.024536, "value_mse_loss_layer_015": 0.02832, "value_mse_loss_layer_016": 0.02478, "value_mse_loss_layer_017": 0.028076, "value_mse_loss_layer_018": 0.025757, "value_mse_loss_layer_019": 0.03064, "value_mse_loss_layer_020": 0.033691, "value_mse_loss_layer_021": 0.036377, "value_mse_loss_layer_022": 0.036621, "value_mse_loss_layer_023": 0.041016, "value_mse_loss_layer_024": 0.041504, "value_mse_loss_layer_025": 0.053711, "value_mse_loss_layer_026": 0.04834, "value_mse_loss_layer_027": 0.061768, "value_mse_loss_layer_028": 0.061279, "value_mse_loss_layer_029": 0.083008, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.087402, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7e-05, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000145, "vq_loss_layer_007": 0.000221, "vq_loss_layer_008": 0.000206, "vq_loss_layer_009": 0.000261, "vq_loss_layer_010": 0.000246, "vq_loss_layer_011": 0.000269, "vq_loss_layer_012": 0.000448, "vq_loss_layer_013": 0.000347, "vq_loss_layer_014": 0.000437, "vq_loss_layer_015": 0.000477, "vq_loss_layer_016": 0.000471, "vq_loss_layer_017": 0.00041, "vq_loss_layer_018": 0.000242, "vq_loss_layer_019": 0.000195, "vq_loss_layer_020": 0.000246, "vq_loss_layer_021": 0.000393, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000326, "vq_loss_layer_024": 0.000242, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000671, "vq_loss_layer_028": 0.00079, "vq_loss_layer_029": 0.001122, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.004913 }, { "ce_loss": 2.323081, "epoch": 0.00423, "grad_norm": 0.003854613984003663, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.082031, "key_mse_loss_layer_020": 0.088867, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.055731, "kv_vq_loss": 0.000612, "learning_rate": 0.0009065850918437606, "loss": 0.056317, "step": 4230, "value_mse_loss_layer_000": 0.000702, "value_mse_loss_layer_001": 0.00209, "value_mse_loss_layer_002": 0.007874, "value_mse_loss_layer_003": 0.012756, "value_mse_loss_layer_004": 0.011841, "value_mse_loss_layer_005": 0.011963, "value_mse_loss_layer_006": 0.014893, "value_mse_loss_layer_007": 0.015625, "value_mse_loss_layer_008": 0.019043, "value_mse_loss_layer_009": 0.025146, "value_mse_loss_layer_010": 0.019897, "value_mse_loss_layer_011": 0.021484, "value_mse_loss_layer_012": 0.022461, "value_mse_loss_layer_013": 0.025146, "value_mse_loss_layer_014": 0.026489, "value_mse_loss_layer_015": 0.027466, "value_mse_loss_layer_016": 0.023071, "value_mse_loss_layer_017": 0.028564, "value_mse_loss_layer_018": 0.024414, "value_mse_loss_layer_019": 0.029175, "value_mse_loss_layer_020": 0.030518, "value_mse_loss_layer_021": 0.036377, "value_mse_loss_layer_022": 0.03418, "value_mse_loss_layer_023": 0.040771, "value_mse_loss_layer_024": 0.04541, "value_mse_loss_layer_025": 0.050049, "value_mse_loss_layer_026": 0.046387, "value_mse_loss_layer_027": 0.064453, "value_mse_loss_layer_028": 0.065918, "value_mse_loss_layer_029": 0.097656, "value_mse_loss_layer_030": 0.080566, "value_mse_loss_layer_031": 0.090332, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 8.8e-05, "vq_loss_layer_006": 0.000203, "vq_loss_layer_007": 0.000226, "vq_loss_layer_008": 0.00025, "vq_loss_layer_009": 0.000351, "vq_loss_layer_010": 0.000228, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.000454, "vq_loss_layer_013": 0.000422, "vq_loss_layer_014": 0.000483, "vq_loss_layer_015": 0.000519, "vq_loss_layer_016": 0.000435, "vq_loss_layer_017": 0.000484, "vq_loss_layer_018": 0.000235, "vq_loss_layer_019": 0.000195, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000441, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000341, "vq_loss_layer_024": 0.000303, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000664, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.001335, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.005524 }, { "ce_loss": 2.289664, "epoch": 0.00424, "grad_norm": 0.005073556210845709, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.123535, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.094238, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.056015, "kv_vq_loss": 0.000645, "learning_rate": 0.000906841464148183, "loss": 0.056656, "step": 4240, "value_mse_loss_layer_000": 0.000702, "value_mse_loss_layer_001": 0.002121, "value_mse_loss_layer_002": 0.009521, "value_mse_loss_layer_003": 0.013672, "value_mse_loss_layer_004": 0.012939, "value_mse_loss_layer_005": 0.012756, "value_mse_loss_layer_006": 0.014526, "value_mse_loss_layer_007": 0.016113, "value_mse_loss_layer_008": 0.018433, "value_mse_loss_layer_009": 0.02356, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.021851, "value_mse_loss_layer_012": 0.022461, "value_mse_loss_layer_013": 0.022949, "value_mse_loss_layer_014": 0.023438, "value_mse_loss_layer_015": 0.025513, "value_mse_loss_layer_016": 0.025879, "value_mse_loss_layer_017": 0.026001, "value_mse_loss_layer_018": 0.026733, "value_mse_loss_layer_019": 0.027466, "value_mse_loss_layer_020": 0.029907, "value_mse_loss_layer_021": 0.035645, "value_mse_loss_layer_022": 0.036865, "value_mse_loss_layer_023": 0.039795, "value_mse_loss_layer_024": 0.04126, "value_mse_loss_layer_025": 0.064941, "value_mse_loss_layer_026": 0.047119, "value_mse_loss_layer_027": 0.058594, "value_mse_loss_layer_028": 0.060791, "value_mse_loss_layer_029": 0.086914, "value_mse_loss_layer_030": 0.094727, "value_mse_loss_layer_031": 0.092773, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 2.3e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 0.00011, "vq_loss_layer_006": 0.000165, "vq_loss_layer_007": 0.000235, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000296, "vq_loss_layer_010": 0.000277, "vq_loss_layer_011": 0.000374, "vq_loss_layer_012": 0.000523, "vq_loss_layer_013": 0.000359, "vq_loss_layer_014": 0.000483, "vq_loss_layer_015": 0.000511, "vq_loss_layer_016": 0.000584, "vq_loss_layer_017": 0.000397, "vq_loss_layer_018": 0.000385, "vq_loss_layer_019": 0.000243, "vq_loss_layer_020": 0.00025, "vq_loss_layer_021": 0.000507, "vq_loss_layer_022": 0.000446, "vq_loss_layer_023": 0.000366, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.000671, "vq_loss_layer_026": 0.000748, "vq_loss_layer_027": 0.000702, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.00164, "vq_loss_layer_030": 0.00325, "vq_loss_layer_031": 0.006836 }, { "ce_loss": 2.272534, "epoch": 0.00425, "grad_norm": 0.003816853743046522, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.056052, "kv_vq_loss": 0.000638, "learning_rate": 0.0009070972325125778, "loss": 0.056702, "step": 4250, "value_mse_loss_layer_000": 0.000721, "value_mse_loss_layer_001": 0.002121, "value_mse_loss_layer_002": 0.008545, "value_mse_loss_layer_003": 0.013184, "value_mse_loss_layer_004": 0.012329, "value_mse_loss_layer_005": 0.012207, "value_mse_loss_layer_006": 0.014526, "value_mse_loss_layer_007": 0.015747, "value_mse_loss_layer_008": 0.019165, "value_mse_loss_layer_009": 0.023926, "value_mse_loss_layer_010": 0.022461, "value_mse_loss_layer_011": 0.021729, "value_mse_loss_layer_012": 0.025146, "value_mse_loss_layer_013": 0.023682, "value_mse_loss_layer_014": 0.025391, "value_mse_loss_layer_015": 0.028076, "value_mse_loss_layer_016": 0.024536, "value_mse_loss_layer_017": 0.029541, "value_mse_loss_layer_018": 0.025146, "value_mse_loss_layer_019": 0.030029, "value_mse_loss_layer_020": 0.034424, "value_mse_loss_layer_021": 0.040527, "value_mse_loss_layer_022": 0.036865, "value_mse_loss_layer_023": 0.038818, "value_mse_loss_layer_024": 0.042725, "value_mse_loss_layer_025": 0.051758, "value_mse_loss_layer_026": 0.04541, "value_mse_loss_layer_027": 0.060547, "value_mse_loss_layer_028": 0.059814, "value_mse_loss_layer_029": 0.088379, "value_mse_loss_layer_030": 0.082031, "value_mse_loss_layer_031": 0.092773, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000162, "vq_loss_layer_007": 0.000223, "vq_loss_layer_008": 0.000265, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000292, "vq_loss_layer_011": 0.000311, "vq_loss_layer_012": 0.000671, "vq_loss_layer_013": 0.00038, "vq_loss_layer_014": 0.000492, "vq_loss_layer_015": 0.000549, "vq_loss_layer_016": 0.000538, "vq_loss_layer_017": 0.000526, "vq_loss_layer_018": 0.000261, "vq_loss_layer_019": 0.000246, "vq_loss_layer_020": 0.000305, "vq_loss_layer_021": 0.000565, "vq_loss_layer_022": 0.000401, "vq_loss_layer_023": 0.000309, "vq_loss_layer_024": 0.000334, "vq_loss_layer_025": 0.000471, "vq_loss_layer_026": 0.00058, "vq_loss_layer_027": 0.00079, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.00161, "vq_loss_layer_030": 0.003433, "vq_loss_layer_031": 0.005859 }, { "ce_loss": 2.29341, "epoch": 0.00426, "grad_norm": 0.003081164788454771, "key_mse_loss_layer_000": 0.004395, "key_mse_loss_layer_001": 0.011597, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.055878, "kv_vq_loss": 0.000621, "learning_rate": 0.0009073523997756796, "loss": 0.056488, "step": 4260, "value_mse_loss_layer_000": 0.000725, "value_mse_loss_layer_001": 0.002136, "value_mse_loss_layer_002": 0.008301, "value_mse_loss_layer_003": 0.013123, "value_mse_loss_layer_004": 0.012085, "value_mse_loss_layer_005": 0.012512, "value_mse_loss_layer_006": 0.013977, "value_mse_loss_layer_007": 0.015381, "value_mse_loss_layer_008": 0.018555, "value_mse_loss_layer_009": 0.02417, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.020874, "value_mse_loss_layer_012": 0.022339, "value_mse_loss_layer_013": 0.023193, "value_mse_loss_layer_014": 0.024048, "value_mse_loss_layer_015": 0.026367, "value_mse_loss_layer_016": 0.022217, "value_mse_loss_layer_017": 0.026611, "value_mse_loss_layer_018": 0.025513, "value_mse_loss_layer_019": 0.028809, "value_mse_loss_layer_020": 0.031128, "value_mse_loss_layer_021": 0.034912, "value_mse_loss_layer_022": 0.034912, "value_mse_loss_layer_023": 0.041504, "value_mse_loss_layer_024": 0.045654, "value_mse_loss_layer_025": 0.050537, "value_mse_loss_layer_026": 0.043945, "value_mse_loss_layer_027": 0.062256, "value_mse_loss_layer_028": 0.0625, "value_mse_loss_layer_029": 0.085938, "value_mse_loss_layer_030": 0.083496, "value_mse_loss_layer_031": 0.09375, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 0.00011, "vq_loss_layer_006": 0.000158, "vq_loss_layer_007": 0.000239, "vq_loss_layer_008": 0.000243, "vq_loss_layer_009": 0.000336, "vq_loss_layer_010": 0.000239, "vq_loss_layer_011": 0.000269, "vq_loss_layer_012": 0.000488, "vq_loss_layer_013": 0.000397, "vq_loss_layer_014": 0.000465, "vq_loss_layer_015": 0.000444, "vq_loss_layer_016": 0.000452, "vq_loss_layer_017": 0.000423, "vq_loss_layer_018": 0.000263, "vq_loss_layer_019": 0.000218, "vq_loss_layer_020": 0.000243, "vq_loss_layer_021": 0.000425, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.00034, "vq_loss_layer_024": 0.000334, "vq_loss_layer_025": 0.000359, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000725, "vq_loss_layer_028": 0.000942, "vq_loss_layer_029": 0.00135, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.005554 }, { "ce_loss": 2.293727, "epoch": 0.00427, "grad_norm": 0.004522582516074181, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.055676, "kv_vq_loss": 0.000603, "learning_rate": 0.0009076069687562559, "loss": 0.05625, "step": 4270, "value_mse_loss_layer_000": 0.000729, "value_mse_loss_layer_001": 0.002182, "value_mse_loss_layer_002": 0.008118, "value_mse_loss_layer_003": 0.016113, "value_mse_loss_layer_004": 0.011841, "value_mse_loss_layer_005": 0.012512, "value_mse_loss_layer_006": 0.014465, "value_mse_loss_layer_007": 0.015747, "value_mse_loss_layer_008": 0.019165, "value_mse_loss_layer_009": 0.024536, "value_mse_loss_layer_010": 0.020874, "value_mse_loss_layer_011": 0.022705, "value_mse_loss_layer_012": 0.022583, "value_mse_loss_layer_013": 0.02478, "value_mse_loss_layer_014": 0.026855, "value_mse_loss_layer_015": 0.03125, "value_mse_loss_layer_016": 0.026245, "value_mse_loss_layer_017": 0.028198, "value_mse_loss_layer_018": 0.027222, "value_mse_loss_layer_019": 0.029907, "value_mse_loss_layer_020": 0.032471, "value_mse_loss_layer_021": 0.03833, "value_mse_loss_layer_022": 0.036621, "value_mse_loss_layer_023": 0.043213, "value_mse_loss_layer_024": 0.044189, "value_mse_loss_layer_025": 0.060303, "value_mse_loss_layer_026": 0.053467, "value_mse_loss_layer_027": 0.060547, "value_mse_loss_layer_028": 0.064453, "value_mse_loss_layer_029": 0.103516, "value_mse_loss_layer_030": 0.088867, "value_mse_loss_layer_031": 0.09375, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000156, "vq_loss_layer_007": 0.000232, "vq_loss_layer_008": 0.000236, "vq_loss_layer_009": 0.000294, "vq_loss_layer_010": 0.000242, "vq_loss_layer_011": 0.000311, "vq_loss_layer_012": 0.000456, "vq_loss_layer_013": 0.000402, "vq_loss_layer_014": 0.000492, "vq_loss_layer_015": 0.000591, "vq_loss_layer_016": 0.000492, "vq_loss_layer_017": 0.000422, "vq_loss_layer_018": 0.000296, "vq_loss_layer_019": 0.000201, "vq_loss_layer_020": 0.000238, "vq_loss_layer_021": 0.000504, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.00036, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.000376, "vq_loss_layer_026": 0.000671, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.001427, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.005524 }, { "ce_loss": 2.336526, "epoch": 0.00428, "grad_norm": 0.003698906395584345, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.044922, "key_mse_loss_layer_004": 0.040527, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.136719, "key_mse_loss_layer_014": 0.132812, "key_mse_loss_layer_015": 0.121094, "key_mse_loss_layer_016": 0.115723, "key_mse_loss_layer_017": 0.11377, "key_mse_loss_layer_018": 0.120605, "key_mse_loss_layer_019": 0.095215, "key_mse_loss_layer_020": 0.110352, "key_mse_loss_layer_021": 0.103027, "key_mse_loss_layer_022": 0.109375, "key_mse_loss_layer_023": 0.10791, "key_mse_loss_layer_024": 0.083496, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.091797, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.094238, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.055594, "kv_vq_loss": 0.00063, "learning_rate": 0.0009078609422532929, "loss": 0.056198, "step": 4280, "value_mse_loss_layer_000": 0.000717, "value_mse_loss_layer_001": 0.00209, "value_mse_loss_layer_002": 0.008118, "value_mse_loss_layer_003": 0.013306, "value_mse_loss_layer_004": 0.012329, "value_mse_loss_layer_005": 0.011536, "value_mse_loss_layer_006": 0.014099, "value_mse_loss_layer_007": 0.015381, "value_mse_loss_layer_008": 0.017944, "value_mse_loss_layer_009": 0.024292, "value_mse_loss_layer_010": 0.020752, "value_mse_loss_layer_011": 0.021118, "value_mse_loss_layer_012": 0.022583, "value_mse_loss_layer_013": 0.022949, "value_mse_loss_layer_014": 0.024292, "value_mse_loss_layer_015": 0.02478, "value_mse_loss_layer_016": 0.020874, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.022705, "value_mse_loss_layer_019": 0.027344, "value_mse_loss_layer_020": 0.030396, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.039307, "value_mse_loss_layer_025": 0.045898, "value_mse_loss_layer_026": 0.039307, "value_mse_loss_layer_027": 0.052002, "value_mse_loss_layer_028": 0.056885, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.088867, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 4.4e-05, "vq_loss_layer_004": 9e-05, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000169, "vq_loss_layer_007": 0.00022, "vq_loss_layer_008": 0.000263, "vq_loss_layer_009": 0.00032, "vq_loss_layer_010": 0.000307, "vq_loss_layer_011": 0.000336, "vq_loss_layer_012": 0.000626, "vq_loss_layer_013": 0.000351, "vq_loss_layer_014": 0.000568, "vq_loss_layer_015": 0.000483, "vq_loss_layer_016": 0.000463, "vq_loss_layer_017": 0.000389, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000232, "vq_loss_layer_020": 0.00041, "vq_loss_layer_021": 0.000542, "vq_loss_layer_022": 0.000349, "vq_loss_layer_023": 0.000395, "vq_loss_layer_024": 0.000431, "vq_loss_layer_025": 0.000546, "vq_loss_layer_026": 0.000679, "vq_loss_layer_027": 0.000736, "vq_loss_layer_028": 0.001511, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.007172 }, { "ce_loss": 2.310284, "epoch": 0.00429, "grad_norm": 0.004294794052839279, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.055722, "kv_vq_loss": 0.000614, "learning_rate": 0.000908114323046181, "loss": 0.056317, "step": 4290, "value_mse_loss_layer_000": 0.000809, "value_mse_loss_layer_001": 0.002136, "value_mse_loss_layer_002": 0.009338, "value_mse_loss_layer_003": 0.012878, "value_mse_loss_layer_004": 0.011597, "value_mse_loss_layer_005": 0.011475, "value_mse_loss_layer_006": 0.013855, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.018188, "value_mse_loss_layer_009": 0.023193, "value_mse_loss_layer_010": 0.019897, "value_mse_loss_layer_011": 0.020996, "value_mse_loss_layer_012": 0.021973, "value_mse_loss_layer_013": 0.023438, "value_mse_loss_layer_014": 0.023926, "value_mse_loss_layer_015": 0.026733, "value_mse_loss_layer_016": 0.022827, "value_mse_loss_layer_017": 0.026245, "value_mse_loss_layer_018": 0.026367, "value_mse_loss_layer_019": 0.030029, "value_mse_loss_layer_020": 0.030396, "value_mse_loss_layer_021": 0.041016, "value_mse_loss_layer_022": 0.036377, "value_mse_loss_layer_023": 0.039551, "value_mse_loss_layer_024": 0.045166, "value_mse_loss_layer_025": 0.055664, "value_mse_loss_layer_026": 0.044922, "value_mse_loss_layer_027": 0.057861, "value_mse_loss_layer_028": 0.060547, "value_mse_loss_layer_029": 0.092285, "value_mse_loss_layer_030": 0.081543, "value_mse_loss_layer_031": 0.088379, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.000152, "vq_loss_layer_007": 0.000271, "vq_loss_layer_008": 0.000213, "vq_loss_layer_009": 0.000265, "vq_loss_layer_010": 0.000239, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.000479, "vq_loss_layer_013": 0.000374, "vq_loss_layer_014": 0.000443, "vq_loss_layer_015": 0.000446, "vq_loss_layer_016": 0.000446, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.000246, "vq_loss_layer_019": 0.000194, "vq_loss_layer_020": 0.000238, "vq_loss_layer_021": 0.000553, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000309, "vq_loss_layer_024": 0.000324, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.004639 }, { "ce_loss": 2.319887, "epoch": 0.0043, "grad_norm": 0.002827730029821396, "key_mse_loss_layer_000": 0.00351, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.05957, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.093262, "key_mse_loss_layer_031": 0.080078, "kv_mse_loss": 0.055914, "kv_vq_loss": 0.000611, "learning_rate": 0.0009083671138948964, "loss": 0.056503, "step": 4300, "value_mse_loss_layer_000": 0.000721, "value_mse_loss_layer_001": 0.002182, "value_mse_loss_layer_002": 0.008545, "value_mse_loss_layer_003": 0.013062, "value_mse_loss_layer_004": 0.011658, "value_mse_loss_layer_005": 0.012268, "value_mse_loss_layer_006": 0.014893, "value_mse_loss_layer_007": 0.015259, "value_mse_loss_layer_008": 0.018555, "value_mse_loss_layer_009": 0.024048, "value_mse_loss_layer_010": 0.020752, "value_mse_loss_layer_011": 0.02356, "value_mse_loss_layer_012": 0.021973, "value_mse_loss_layer_013": 0.023682, "value_mse_loss_layer_014": 0.027222, "value_mse_loss_layer_015": 0.027832, "value_mse_loss_layer_016": 0.024536, "value_mse_loss_layer_017": 0.028564, "value_mse_loss_layer_018": 0.024658, "value_mse_loss_layer_019": 0.030396, "value_mse_loss_layer_020": 0.032715, "value_mse_loss_layer_021": 0.034668, "value_mse_loss_layer_022": 0.035645, "value_mse_loss_layer_023": 0.038574, "value_mse_loss_layer_024": 0.043945, "value_mse_loss_layer_025": 0.050293, "value_mse_loss_layer_026": 0.04541, "value_mse_loss_layer_027": 0.05835, "value_mse_loss_layer_028": 0.060303, "value_mse_loss_layer_029": 0.087402, "value_mse_loss_layer_030": 0.083008, "value_mse_loss_layer_031": 0.090332, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 7.7e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000183, "vq_loss_layer_007": 0.000226, "vq_loss_layer_008": 0.000227, "vq_loss_layer_009": 0.000326, "vq_loss_layer_010": 0.000256, "vq_loss_layer_011": 0.000425, "vq_loss_layer_012": 0.000481, "vq_loss_layer_013": 0.000431, "vq_loss_layer_014": 0.000557, "vq_loss_layer_015": 0.000538, "vq_loss_layer_016": 0.000504, "vq_loss_layer_017": 0.000603, "vq_loss_layer_018": 0.000261, "vq_loss_layer_019": 0.000211, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.000381, "vq_loss_layer_022": 0.000288, "vq_loss_layer_023": 0.000303, "vq_loss_layer_024": 0.000309, "vq_loss_layer_025": 0.000401, "vq_loss_layer_026": 0.00058, "vq_loss_layer_027": 0.000679, "vq_loss_layer_028": 0.000908, "vq_loss_layer_029": 0.001762, "vq_loss_layer_030": 0.002991, "vq_loss_layer_031": 0.005493 }, { "ce_loss": 2.310493, "epoch": 0.00431, "grad_norm": 0.003695680294185877, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.055664, "key_mse_loss_layer_006": 0.0625, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.094727, "key_mse_loss_layer_019": 0.082031, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.075684, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.055801, "kv_vq_loss": 0.000623, "learning_rate": 0.0009086193175401828, "loss": 0.056412, "step": 4310, "value_mse_loss_layer_000": 0.000736, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.00824, "value_mse_loss_layer_003": 0.014832, "value_mse_loss_layer_004": 0.012085, "value_mse_loss_layer_005": 0.012146, "value_mse_loss_layer_006": 0.014221, "value_mse_loss_layer_007": 0.01532, "value_mse_loss_layer_008": 0.018433, "value_mse_loss_layer_009": 0.02417, "value_mse_loss_layer_010": 0.020508, "value_mse_loss_layer_011": 0.021729, "value_mse_loss_layer_012": 0.022461, "value_mse_loss_layer_013": 0.023682, "value_mse_loss_layer_014": 0.025635, "value_mse_loss_layer_015": 0.029419, "value_mse_loss_layer_016": 0.023438, "value_mse_loss_layer_017": 0.028198, "value_mse_loss_layer_018": 0.023926, "value_mse_loss_layer_019": 0.028687, "value_mse_loss_layer_020": 0.03064, "value_mse_loss_layer_021": 0.043457, "value_mse_loss_layer_022": 0.033936, "value_mse_loss_layer_023": 0.039307, "value_mse_loss_layer_024": 0.043701, "value_mse_loss_layer_025": 0.052246, "value_mse_loss_layer_026": 0.043457, "value_mse_loss_layer_027": 0.058594, "value_mse_loss_layer_028": 0.062012, "value_mse_loss_layer_029": 0.084961, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.088379, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 9.1e-05, "vq_loss_layer_006": 0.00015, "vq_loss_layer_007": 0.000205, "vq_loss_layer_008": 0.00022, "vq_loss_layer_009": 0.00028, "vq_loss_layer_010": 0.000267, "vq_loss_layer_011": 0.000299, "vq_loss_layer_012": 0.000458, "vq_loss_layer_013": 0.000402, "vq_loss_layer_014": 0.000523, "vq_loss_layer_015": 0.000645, "vq_loss_layer_016": 0.000511, "vq_loss_layer_017": 0.000542, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000214, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.00069, "vq_loss_layer_022": 0.000296, "vq_loss_layer_023": 0.000393, "vq_loss_layer_024": 0.000334, "vq_loss_layer_025": 0.000412, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000671, "vq_loss_layer_028": 0.001022, "vq_loss_layer_029": 0.001358, "vq_loss_layer_030": 0.002563, "vq_loss_layer_031": 0.005676 }, { "ce_loss": 2.352152, "epoch": 0.00432, "grad_norm": 0.004960455000400543, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.075684, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.055786, "kv_vq_loss": 0.000612, "learning_rate": 0.0009088709367037279, "loss": 0.056381, "step": 4320, "value_mse_loss_layer_000": 0.00071, "value_mse_loss_layer_001": 0.00209, "value_mse_loss_layer_002": 0.008179, "value_mse_loss_layer_003": 0.013611, "value_mse_loss_layer_004": 0.012085, "value_mse_loss_layer_005": 0.011902, "value_mse_loss_layer_006": 0.014221, "value_mse_loss_layer_007": 0.016357, "value_mse_loss_layer_008": 0.019287, "value_mse_loss_layer_009": 0.024536, "value_mse_loss_layer_010": 0.020386, "value_mse_loss_layer_011": 0.020996, "value_mse_loss_layer_012": 0.021606, "value_mse_loss_layer_013": 0.023682, "value_mse_loss_layer_014": 0.02478, "value_mse_loss_layer_015": 0.028442, "value_mse_loss_layer_016": 0.02356, "value_mse_loss_layer_017": 0.027344, "value_mse_loss_layer_018": 0.030396, "value_mse_loss_layer_019": 0.028442, "value_mse_loss_layer_020": 0.033936, "value_mse_loss_layer_021": 0.037842, "value_mse_loss_layer_022": 0.0354, "value_mse_loss_layer_023": 0.04248, "value_mse_loss_layer_024": 0.040283, "value_mse_loss_layer_025": 0.056152, "value_mse_loss_layer_026": 0.042969, "value_mse_loss_layer_027": 0.055664, "value_mse_loss_layer_028": 0.059082, "value_mse_loss_layer_029": 0.081543, "value_mse_loss_layer_030": 0.07373, "value_mse_loss_layer_031": 0.09668, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000163, "vq_loss_layer_007": 0.00025, "vq_loss_layer_008": 0.000278, "vq_loss_layer_009": 0.000328, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000278, "vq_loss_layer_012": 0.000443, "vq_loss_layer_013": 0.000385, "vq_loss_layer_014": 0.000496, "vq_loss_layer_015": 0.000572, "vq_loss_layer_016": 0.000492, "vq_loss_layer_017": 0.000404, "vq_loss_layer_018": 0.000441, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.00032, "vq_loss_layer_021": 0.00053, "vq_loss_layer_022": 0.000362, "vq_loss_layer_023": 0.000469, "vq_loss_layer_024": 0.000296, "vq_loss_layer_025": 0.000465, "vq_loss_layer_026": 0.00058, "vq_loss_layer_027": 0.000603, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.001411, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.006989 }, { "ce_loss": 2.317235, "epoch": 0.00433, "grad_norm": 0.003318676259368658, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.055658, "kv_vq_loss": 0.000617, "learning_rate": 0.0009091219740883412, "loss": 0.05625, "step": 4330, "value_mse_loss_layer_000": 0.00071, "value_mse_loss_layer_001": 0.002121, "value_mse_loss_layer_002": 0.009155, "value_mse_loss_layer_003": 0.013306, "value_mse_loss_layer_004": 0.011658, "value_mse_loss_layer_005": 0.011536, "value_mse_loss_layer_006": 0.013733, "value_mse_loss_layer_007": 0.015747, "value_mse_loss_layer_008": 0.018555, "value_mse_loss_layer_009": 0.023804, "value_mse_loss_layer_010": 0.019897, "value_mse_loss_layer_011": 0.021729, "value_mse_loss_layer_012": 0.021484, "value_mse_loss_layer_013": 0.023315, "value_mse_loss_layer_014": 0.023926, "value_mse_loss_layer_015": 0.026733, "value_mse_loss_layer_016": 0.023315, "value_mse_loss_layer_017": 0.026123, "value_mse_loss_layer_018": 0.02417, "value_mse_loss_layer_019": 0.027466, "value_mse_loss_layer_020": 0.028931, "value_mse_loss_layer_021": 0.035889, "value_mse_loss_layer_022": 0.033203, "value_mse_loss_layer_023": 0.044434, "value_mse_loss_layer_024": 0.039795, "value_mse_loss_layer_025": 0.051758, "value_mse_loss_layer_026": 0.041992, "value_mse_loss_layer_027": 0.053711, "value_mse_loss_layer_028": 0.056885, "value_mse_loss_layer_029": 0.077637, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.088379, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.00015, "vq_loss_layer_007": 0.000271, "vq_loss_layer_008": 0.000225, "vq_loss_layer_009": 0.000305, "vq_loss_layer_010": 0.000226, "vq_loss_layer_011": 0.000301, "vq_loss_layer_012": 0.000458, "vq_loss_layer_013": 0.000364, "vq_loss_layer_014": 0.000471, "vq_loss_layer_015": 0.000462, "vq_loss_layer_016": 0.000511, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000225, "vq_loss_layer_021": 0.000481, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000448, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.00034, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.001114, "vq_loss_layer_030": 0.002289, "vq_loss_layer_031": 0.004913 }, { "ce_loss": 2.290012, "epoch": 0.00434, "grad_norm": 0.0040342481806874275, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.055722, "kv_vq_loss": 0.000628, "learning_rate": 0.0009093724323781276, "loss": 0.056332, "step": 4340, "value_mse_loss_layer_000": 0.000713, "value_mse_loss_layer_001": 0.002197, "value_mse_loss_layer_002": 0.008667, "value_mse_loss_layer_003": 0.013, "value_mse_loss_layer_004": 0.012085, "value_mse_loss_layer_005": 0.012634, "value_mse_loss_layer_006": 0.013733, "value_mse_loss_layer_007": 0.015076, "value_mse_loss_layer_008": 0.018677, "value_mse_loss_layer_009": 0.023438, "value_mse_loss_layer_010": 0.02063, "value_mse_loss_layer_011": 0.021118, "value_mse_loss_layer_012": 0.021118, "value_mse_loss_layer_013": 0.022827, "value_mse_loss_layer_014": 0.025757, "value_mse_loss_layer_015": 0.026123, "value_mse_loss_layer_016": 0.022461, "value_mse_loss_layer_017": 0.027588, "value_mse_loss_layer_018": 0.025024, "value_mse_loss_layer_019": 0.03125, "value_mse_loss_layer_020": 0.034424, "value_mse_loss_layer_021": 0.036621, "value_mse_loss_layer_022": 0.035156, "value_mse_loss_layer_023": 0.040771, "value_mse_loss_layer_024": 0.043701, "value_mse_loss_layer_025": 0.051514, "value_mse_loss_layer_026": 0.047363, "value_mse_loss_layer_027": 0.061035, "value_mse_loss_layer_028": 0.065918, "value_mse_loss_layer_029": 0.095215, "value_mse_loss_layer_030": 0.08252, "value_mse_loss_layer_031": 0.090332, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 9.9e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000214, "vq_loss_layer_008": 0.00024, "vq_loss_layer_009": 0.00029, "vq_loss_layer_010": 0.000244, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.000423, "vq_loss_layer_013": 0.000355, "vq_loss_layer_014": 0.000538, "vq_loss_layer_015": 0.000429, "vq_loss_layer_016": 0.000462, "vq_loss_layer_017": 0.000454, "vq_loss_layer_018": 0.000275, "vq_loss_layer_019": 0.000217, "vq_loss_layer_020": 0.000231, "vq_loss_layer_021": 0.000431, "vq_loss_layer_022": 0.000271, "vq_loss_layer_023": 0.000309, "vq_loss_layer_024": 0.000303, "vq_loss_layer_025": 0.000423, "vq_loss_layer_026": 0.000572, "vq_loss_layer_027": 0.000702, "vq_loss_layer_028": 0.000874, "vq_loss_layer_029": 0.001427, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.005157 }, { "ce_loss": 2.302954, "epoch": 0.00435, "grad_norm": 0.00393550144508481, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.055991, "kv_vq_loss": 0.000622, "learning_rate": 0.0009096223142386593, "loss": 0.05658, "step": 4350, "value_mse_loss_layer_000": 0.000687, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.008057, "value_mse_loss_layer_003": 0.012756, "value_mse_loss_layer_004": 0.013855, "value_mse_loss_layer_005": 0.011597, "value_mse_loss_layer_006": 0.01416, "value_mse_loss_layer_007": 0.014832, "value_mse_loss_layer_008": 0.018188, "value_mse_loss_layer_009": 0.02356, "value_mse_loss_layer_010": 0.019531, "value_mse_loss_layer_011": 0.020996, "value_mse_loss_layer_012": 0.020996, "value_mse_loss_layer_013": 0.023315, "value_mse_loss_layer_014": 0.023682, "value_mse_loss_layer_015": 0.027588, "value_mse_loss_layer_016": 0.022827, "value_mse_loss_layer_017": 0.025269, "value_mse_loss_layer_018": 0.024536, "value_mse_loss_layer_019": 0.028442, "value_mse_loss_layer_020": 0.029907, "value_mse_loss_layer_021": 0.034668, "value_mse_loss_layer_022": 0.035889, "value_mse_loss_layer_023": 0.039307, "value_mse_loss_layer_024": 0.045654, "value_mse_loss_layer_025": 0.057373, "value_mse_loss_layer_026": 0.048828, "value_mse_loss_layer_027": 0.05835, "value_mse_loss_layer_028": 0.0625, "value_mse_loss_layer_029": 0.087891, "value_mse_loss_layer_030": 0.080566, "value_mse_loss_layer_031": 0.090332, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 0.00016, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000202, "vq_loss_layer_008": 0.000219, "vq_loss_layer_009": 0.000303, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000275, "vq_loss_layer_012": 0.000433, "vq_loss_layer_013": 0.00038, "vq_loss_layer_014": 0.000425, "vq_loss_layer_015": 0.000538, "vq_loss_layer_016": 0.000463, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000275, "vq_loss_layer_019": 0.00021, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000328, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000387, "vq_loss_layer_025": 0.000351, "vq_loss_layer_026": 0.000645, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.005463 }, { "ce_loss": 2.292097, "epoch": 0.00436, "grad_norm": 0.0032170440535992384, "key_mse_loss_layer_000": 0.003693, "key_mse_loss_layer_001": 0.010986, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.081055, "kv_mse_loss": 0.055234, "kv_vq_loss": 0.000619, "learning_rate": 0.0009098716223171463, "loss": 0.055823, "step": 4360, "value_mse_loss_layer_000": 0.000755, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.00824, "value_mse_loss_layer_003": 0.013733, "value_mse_loss_layer_004": 0.012085, "value_mse_loss_layer_005": 0.011597, "value_mse_loss_layer_006": 0.013855, "value_mse_loss_layer_007": 0.015564, "value_mse_loss_layer_008": 0.018921, "value_mse_loss_layer_009": 0.023071, "value_mse_loss_layer_010": 0.022339, "value_mse_loss_layer_011": 0.021484, "value_mse_loss_layer_012": 0.021606, "value_mse_loss_layer_013": 0.022705, "value_mse_loss_layer_014": 0.023804, "value_mse_loss_layer_015": 0.026367, "value_mse_loss_layer_016": 0.023071, "value_mse_loss_layer_017": 0.026123, "value_mse_loss_layer_018": 0.027588, "value_mse_loss_layer_019": 0.028809, "value_mse_loss_layer_020": 0.029663, "value_mse_loss_layer_021": 0.035645, "value_mse_loss_layer_022": 0.03418, "value_mse_loss_layer_023": 0.03833, "value_mse_loss_layer_024": 0.04126, "value_mse_loss_layer_025": 0.051025, "value_mse_loss_layer_026": 0.047607, "value_mse_loss_layer_027": 0.063477, "value_mse_loss_layer_028": 0.059326, "value_mse_loss_layer_029": 0.081543, "value_mse_loss_layer_030": 0.083984, "value_mse_loss_layer_031": 0.09082, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000157, "vq_loss_layer_007": 0.000256, "vq_loss_layer_008": 0.000267, "vq_loss_layer_009": 0.00028, "vq_loss_layer_010": 0.00029, "vq_loss_layer_011": 0.000315, "vq_loss_layer_012": 0.000488, "vq_loss_layer_013": 0.000341, "vq_loss_layer_014": 0.000465, "vq_loss_layer_015": 0.000538, "vq_loss_layer_016": 0.000475, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.000277, "vq_loss_layer_019": 0.000198, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000454, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.000629, "vq_loss_layer_027": 0.000813, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.001213, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.005157 }, { "ce_loss": 2.322962, "epoch": 0.00437, "grad_norm": 0.004123267717659473, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.055615, "kv_vq_loss": 0.000614, "learning_rate": 0.0009101203592426053, "loss": 0.056201, "step": 4370, "value_mse_loss_layer_000": 0.000725, "value_mse_loss_layer_001": 0.002121, "value_mse_loss_layer_002": 0.008728, "value_mse_loss_layer_003": 0.01355, "value_mse_loss_layer_004": 0.012207, "value_mse_loss_layer_005": 0.012329, "value_mse_loss_layer_006": 0.014343, "value_mse_loss_layer_007": 0.015564, "value_mse_loss_layer_008": 0.018677, "value_mse_loss_layer_009": 0.024658, "value_mse_loss_layer_010": 0.02063, "value_mse_loss_layer_011": 0.02124, "value_mse_loss_layer_012": 0.022339, "value_mse_loss_layer_013": 0.023804, "value_mse_loss_layer_014": 0.025879, "value_mse_loss_layer_015": 0.027832, "value_mse_loss_layer_016": 0.02478, "value_mse_loss_layer_017": 0.029785, "value_mse_loss_layer_018": 0.025146, "value_mse_loss_layer_019": 0.028076, "value_mse_loss_layer_020": 0.030029, "value_mse_loss_layer_021": 0.042725, "value_mse_loss_layer_022": 0.034668, "value_mse_loss_layer_023": 0.043213, "value_mse_loss_layer_024": 0.039307, "value_mse_loss_layer_025": 0.050781, "value_mse_loss_layer_026": 0.043213, "value_mse_loss_layer_027": 0.055664, "value_mse_loss_layer_028": 0.05957, "value_mse_loss_layer_029": 0.083008, "value_mse_loss_layer_030": 0.072754, "value_mse_loss_layer_031": 0.087891, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000161, "vq_loss_layer_007": 0.000221, "vq_loss_layer_008": 0.000246, "vq_loss_layer_009": 0.000353, "vq_loss_layer_010": 0.000294, "vq_loss_layer_011": 0.000278, "vq_loss_layer_012": 0.000473, "vq_loss_layer_013": 0.000414, "vq_loss_layer_014": 0.000519, "vq_loss_layer_015": 0.000568, "vq_loss_layer_016": 0.000553, "vq_loss_layer_017": 0.000614, "vq_loss_layer_018": 0.000271, "vq_loss_layer_019": 0.000216, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000694, "vq_loss_layer_022": 0.000359, "vq_loss_layer_023": 0.000542, "vq_loss_layer_024": 0.000362, "vq_loss_layer_025": 0.000463, "vq_loss_layer_026": 0.000656, "vq_loss_layer_027": 0.000725, "vq_loss_layer_028": 0.001175, "vq_loss_layer_029": 0.001305, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.006104 }, { "ce_loss": 2.256776, "epoch": 0.00438, "grad_norm": 0.0038791841361671686, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.055765, "kv_vq_loss": 0.000636, "learning_rate": 0.0009103685276260248, "loss": 0.056375, "step": 4380, "value_mse_loss_layer_000": 0.000698, "value_mse_loss_layer_001": 0.002136, "value_mse_loss_layer_002": 0.008484, "value_mse_loss_layer_003": 0.013062, "value_mse_loss_layer_004": 0.012512, "value_mse_loss_layer_005": 0.01178, "value_mse_loss_layer_006": 0.014221, "value_mse_loss_layer_007": 0.015442, "value_mse_loss_layer_008": 0.019043, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.019775, "value_mse_loss_layer_011": 0.022461, "value_mse_loss_layer_012": 0.021606, "value_mse_loss_layer_013": 0.023438, "value_mse_loss_layer_014": 0.02417, "value_mse_loss_layer_015": 0.027344, "value_mse_loss_layer_016": 0.021484, "value_mse_loss_layer_017": 0.026001, "value_mse_loss_layer_018": 0.022949, "value_mse_loss_layer_019": 0.026978, "value_mse_loss_layer_020": 0.028076, "value_mse_loss_layer_021": 0.035645, "value_mse_loss_layer_022": 0.032471, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.041748, "value_mse_loss_layer_025": 0.05249, "value_mse_loss_layer_026": 0.041748, "value_mse_loss_layer_027": 0.054443, "value_mse_loss_layer_028": 0.056641, "value_mse_loss_layer_029": 0.081543, "value_mse_loss_layer_030": 0.075195, "value_mse_loss_layer_031": 0.087891, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000155, "vq_loss_layer_007": 0.000232, "vq_loss_layer_008": 0.000273, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000431, "vq_loss_layer_012": 0.000473, "vq_loss_layer_013": 0.000376, "vq_loss_layer_014": 0.000475, "vq_loss_layer_015": 0.00061, "vq_loss_layer_016": 0.000465, "vq_loss_layer_017": 0.000387, "vq_loss_layer_018": 0.000213, "vq_loss_layer_019": 0.000192, "vq_loss_layer_020": 0.000232, "vq_loss_layer_021": 0.000534, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.000353, "vq_loss_layer_024": 0.000322, "vq_loss_layer_025": 0.000467, "vq_loss_layer_026": 0.000568, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.00132, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.006073 }, { "ce_loss": 2.326266, "epoch": 0.00439, "grad_norm": 0.0038959812372922897, "key_mse_loss_layer_000": 0.002609, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.052002, "key_mse_loss_layer_003": 0.043213, "key_mse_loss_layer_004": 0.040771, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.0625, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.128906, "key_mse_loss_layer_014": 0.125977, "key_mse_loss_layer_015": 0.110352, "key_mse_loss_layer_016": 0.102051, "key_mse_loss_layer_017": 0.10498, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.055273, "kv_vq_loss": 0.000606, "learning_rate": 0.0009106161300605301, "loss": 0.055853, "step": 4390, "value_mse_loss_layer_000": 0.000721, "value_mse_loss_layer_001": 0.002075, "value_mse_loss_layer_002": 0.00824, "value_mse_loss_layer_003": 0.014099, "value_mse_loss_layer_004": 0.012451, "value_mse_loss_layer_005": 0.011963, "value_mse_loss_layer_006": 0.014648, "value_mse_loss_layer_007": 0.015869, "value_mse_loss_layer_008": 0.018677, "value_mse_loss_layer_009": 0.024536, "value_mse_loss_layer_010": 0.022095, "value_mse_loss_layer_011": 0.022705, "value_mse_loss_layer_012": 0.026245, "value_mse_loss_layer_013": 0.025024, "value_mse_loss_layer_014": 0.025269, "value_mse_loss_layer_015": 0.027222, "value_mse_loss_layer_016": 0.023193, "value_mse_loss_layer_017": 0.027344, "value_mse_loss_layer_018": 0.023926, "value_mse_loss_layer_019": 0.031738, "value_mse_loss_layer_020": 0.030029, "value_mse_loss_layer_021": 0.03418, "value_mse_loss_layer_022": 0.032715, "value_mse_loss_layer_023": 0.036865, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.051758, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.057129, "value_mse_loss_layer_029": 0.075195, "value_mse_loss_layer_030": 0.073242, "value_mse_loss_layer_031": 0.084473, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 8.9e-05, "vq_loss_layer_006": 0.000158, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000263, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.00032, "vq_loss_layer_011": 0.000341, "vq_loss_layer_012": 0.000702, "vq_loss_layer_013": 0.000397, "vq_loss_layer_014": 0.000519, "vq_loss_layer_015": 0.000546, "vq_loss_layer_016": 0.000481, "vq_loss_layer_017": 0.000416, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000271, "vq_loss_layer_020": 0.000277, "vq_loss_layer_021": 0.000496, "vq_loss_layer_022": 0.000326, "vq_loss_layer_023": 0.000448, "vq_loss_layer_024": 0.000315, "vq_loss_layer_025": 0.000553, "vq_loss_layer_026": 0.000576, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.001282, "vq_loss_layer_029": 0.001205, "vq_loss_layer_030": 0.002701, "vq_loss_layer_031": 0.005798 }, { "ce_loss": 2.300414, "epoch": 0.0044, "grad_norm": 0.003143343375995755, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.055634, "kv_vq_loss": 0.00061, "learning_rate": 0.0009108631691215468, "loss": 0.056204, "step": 4400, "value_mse_loss_layer_000": 0.000744, "value_mse_loss_layer_001": 0.002121, "value_mse_loss_layer_002": 0.008484, "value_mse_loss_layer_003": 0.013306, "value_mse_loss_layer_004": 0.012329, "value_mse_loss_layer_005": 0.013672, "value_mse_loss_layer_006": 0.014648, "value_mse_loss_layer_007": 0.016846, "value_mse_loss_layer_008": 0.019409, "value_mse_loss_layer_009": 0.024414, "value_mse_loss_layer_010": 0.02063, "value_mse_loss_layer_011": 0.021484, "value_mse_loss_layer_012": 0.022827, "value_mse_loss_layer_013": 0.023926, "value_mse_loss_layer_014": 0.026245, "value_mse_loss_layer_015": 0.028809, "value_mse_loss_layer_016": 0.024902, "value_mse_loss_layer_017": 0.027832, "value_mse_loss_layer_018": 0.025635, "value_mse_loss_layer_019": 0.028809, "value_mse_loss_layer_020": 0.030884, "value_mse_loss_layer_021": 0.036377, "value_mse_loss_layer_022": 0.0354, "value_mse_loss_layer_023": 0.040039, "value_mse_loss_layer_024": 0.043213, "value_mse_loss_layer_025": 0.052246, "value_mse_loss_layer_026": 0.041992, "value_mse_loss_layer_027": 0.055176, "value_mse_loss_layer_028": 0.057617, "value_mse_loss_layer_029": 0.082031, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.088867, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 0.000126, "vq_loss_layer_006": 0.000167, "vq_loss_layer_007": 0.000286, "vq_loss_layer_008": 0.000277, "vq_loss_layer_009": 0.000298, "vq_loss_layer_010": 0.000256, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000483, "vq_loss_layer_013": 0.000368, "vq_loss_layer_014": 0.000526, "vq_loss_layer_015": 0.000565, "vq_loss_layer_016": 0.000561, "vq_loss_layer_017": 0.00042, "vq_loss_layer_018": 0.000292, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.000511, "vq_loss_layer_022": 0.000319, "vq_loss_layer_023": 0.00037, "vq_loss_layer_024": 0.000332, "vq_loss_layer_025": 0.000412, "vq_loss_layer_026": 0.000572, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.000816, "vq_loss_layer_029": 0.001251, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.005493 }, { "ce_loss": 2.309845, "epoch": 0.00441, "grad_norm": 0.005099864676594734, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.055756, "kv_vq_loss": 0.000616, "learning_rate": 0.0009111096473669595, "loss": 0.056348, "step": 4410, "value_mse_loss_layer_000": 0.000732, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.008606, "value_mse_loss_layer_003": 0.013, "value_mse_loss_layer_004": 0.012695, "value_mse_loss_layer_005": 0.012146, "value_mse_loss_layer_006": 0.014526, "value_mse_loss_layer_007": 0.015381, "value_mse_loss_layer_008": 0.018799, "value_mse_loss_layer_009": 0.023438, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.022095, "value_mse_loss_layer_012": 0.021973, "value_mse_loss_layer_013": 0.023804, "value_mse_loss_layer_014": 0.025146, "value_mse_loss_layer_015": 0.027222, "value_mse_loss_layer_016": 0.022095, "value_mse_loss_layer_017": 0.027222, "value_mse_loss_layer_018": 0.025635, "value_mse_loss_layer_019": 0.028076, "value_mse_loss_layer_020": 0.03064, "value_mse_loss_layer_021": 0.035889, "value_mse_loss_layer_022": 0.033447, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.041992, "value_mse_loss_layer_025": 0.05127, "value_mse_loss_layer_026": 0.047852, "value_mse_loss_layer_027": 0.055908, "value_mse_loss_layer_028": 0.061768, "value_mse_loss_layer_029": 0.086426, "value_mse_loss_layer_030": 0.094238, "value_mse_loss_layer_031": 0.085449, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 0.000107, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000177, "vq_loss_layer_007": 0.000224, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.000278, "vq_loss_layer_010": 0.00024, "vq_loss_layer_011": 0.000311, "vq_loss_layer_012": 0.000446, "vq_loss_layer_013": 0.000402, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000465, "vq_loss_layer_016": 0.000414, "vq_loss_layer_017": 0.000433, "vq_loss_layer_018": 0.000259, "vq_loss_layer_019": 0.00019, "vq_loss_layer_020": 0.00024, "vq_loss_layer_021": 0.000479, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000362, "vq_loss_layer_026": 0.000679, "vq_loss_layer_027": 0.000607, "vq_loss_layer_028": 0.000977, "vq_loss_layer_029": 0.001205, "vq_loss_layer_030": 0.002625, "vq_loss_layer_031": 0.005035 }, { "ce_loss": 2.319285, "epoch": 0.00442, "grad_norm": 0.004719000309705734, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.055389, "kv_vq_loss": 0.000615, "learning_rate": 0.0009113555673372728, "loss": 0.055981, "step": 4420, "value_mse_loss_layer_000": 0.00071, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.008362, "value_mse_loss_layer_003": 0.013977, "value_mse_loss_layer_004": 0.012329, "value_mse_loss_layer_005": 0.012085, "value_mse_loss_layer_006": 0.014038, "value_mse_loss_layer_007": 0.015869, "value_mse_loss_layer_008": 0.019043, "value_mse_loss_layer_009": 0.026001, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.020752, "value_mse_loss_layer_012": 0.022095, "value_mse_loss_layer_013": 0.022705, "value_mse_loss_layer_014": 0.024292, "value_mse_loss_layer_015": 0.027222, "value_mse_loss_layer_016": 0.022827, "value_mse_loss_layer_017": 0.026611, "value_mse_loss_layer_018": 0.026367, "value_mse_loss_layer_019": 0.028564, "value_mse_loss_layer_020": 0.030151, "value_mse_loss_layer_021": 0.038818, "value_mse_loss_layer_022": 0.037109, "value_mse_loss_layer_023": 0.043945, "value_mse_loss_layer_024": 0.043945, "value_mse_loss_layer_025": 0.054932, "value_mse_loss_layer_026": 0.047852, "value_mse_loss_layer_027": 0.063965, "value_mse_loss_layer_028": 0.06543, "value_mse_loss_layer_029": 0.095215, "value_mse_loss_layer_030": 0.086914, "value_mse_loss_layer_031": 0.096191, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 7.7e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000233, "vq_loss_layer_008": 0.000254, "vq_loss_layer_009": 0.000429, "vq_loss_layer_010": 0.000254, "vq_loss_layer_011": 0.000275, "vq_loss_layer_012": 0.000475, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000448, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.000479, "vq_loss_layer_017": 0.000402, "vq_loss_layer_018": 0.00029, "vq_loss_layer_019": 0.000211, "vq_loss_layer_020": 0.000225, "vq_loss_layer_021": 0.000475, "vq_loss_layer_022": 0.000326, "vq_loss_layer_023": 0.000406, "vq_loss_layer_024": 0.000334, "vq_loss_layer_025": 0.000435, "vq_loss_layer_026": 0.000648, "vq_loss_layer_027": 0.000782, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.001831, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.006317 }, { "ce_loss": 2.350257, "epoch": 0.00443, "grad_norm": 0.00423553166911006, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.055643, "kv_vq_loss": 0.000612, "learning_rate": 0.0009116009315557671, "loss": 0.056241, "step": 4430, "value_mse_loss_layer_000": 0.000713, "value_mse_loss_layer_001": 0.00209, "value_mse_loss_layer_002": 0.007996, "value_mse_loss_layer_003": 0.012817, "value_mse_loss_layer_004": 0.011414, "value_mse_loss_layer_005": 0.011536, "value_mse_loss_layer_006": 0.014221, "value_mse_loss_layer_007": 0.015442, "value_mse_loss_layer_008": 0.018433, "value_mse_loss_layer_009": 0.023804, "value_mse_loss_layer_010": 0.021362, "value_mse_loss_layer_011": 0.021606, "value_mse_loss_layer_012": 0.022217, "value_mse_loss_layer_013": 0.024414, "value_mse_loss_layer_014": 0.025146, "value_mse_loss_layer_015": 0.026978, "value_mse_loss_layer_016": 0.02417, "value_mse_loss_layer_017": 0.027466, "value_mse_loss_layer_018": 0.024414, "value_mse_loss_layer_019": 0.030273, "value_mse_loss_layer_020": 0.04126, "value_mse_loss_layer_021": 0.036865, "value_mse_loss_layer_022": 0.033691, "value_mse_loss_layer_023": 0.041016, "value_mse_loss_layer_024": 0.039795, "value_mse_loss_layer_025": 0.052979, "value_mse_loss_layer_026": 0.041504, "value_mse_loss_layer_027": 0.053955, "value_mse_loss_layer_028": 0.058105, "value_mse_loss_layer_029": 0.084473, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.085938, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.000172, "vq_loss_layer_007": 0.000237, "vq_loss_layer_008": 0.000232, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000269, "vq_loss_layer_011": 0.000286, "vq_loss_layer_012": 0.000511, "vq_loss_layer_013": 0.000454, "vq_loss_layer_014": 0.0005, "vq_loss_layer_015": 0.000467, "vq_loss_layer_016": 0.000526, "vq_loss_layer_017": 0.000422, "vq_loss_layer_018": 0.00024, "vq_loss_layer_019": 0.000292, "vq_loss_layer_020": 0.000404, "vq_loss_layer_021": 0.00046, "vq_loss_layer_022": 0.00028, "vq_loss_layer_023": 0.000412, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000372, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.001144, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.004913 }, { "ce_loss": 2.258513, "epoch": 0.00444, "grad_norm": 0.003261048812419176, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.05578, "kv_vq_loss": 0.000609, "learning_rate": 0.0009118457425286548, "loss": 0.056372, "step": 4440, "value_mse_loss_layer_000": 0.000713, "value_mse_loss_layer_001": 0.002075, "value_mse_loss_layer_002": 0.007935, "value_mse_loss_layer_003": 0.013428, "value_mse_loss_layer_004": 0.011719, "value_mse_loss_layer_005": 0.011902, "value_mse_loss_layer_006": 0.013855, "value_mse_loss_layer_007": 0.014893, "value_mse_loss_layer_008": 0.018921, "value_mse_loss_layer_009": 0.023682, "value_mse_loss_layer_010": 0.020142, "value_mse_loss_layer_011": 0.021118, "value_mse_loss_layer_012": 0.023315, "value_mse_loss_layer_013": 0.023315, "value_mse_loss_layer_014": 0.02417, "value_mse_loss_layer_015": 0.031982, "value_mse_loss_layer_016": 0.023071, "value_mse_loss_layer_017": 0.027588, "value_mse_loss_layer_018": 0.025269, "value_mse_loss_layer_019": 0.028687, "value_mse_loss_layer_020": 0.031494, "value_mse_loss_layer_021": 0.035645, "value_mse_loss_layer_022": 0.034424, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.04126, "value_mse_loss_layer_025": 0.049561, "value_mse_loss_layer_026": 0.046143, "value_mse_loss_layer_027": 0.055664, "value_mse_loss_layer_028": 0.060303, "value_mse_loss_layer_029": 0.080566, "value_mse_loss_layer_030": 0.074219, "value_mse_loss_layer_031": 0.086914, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 9.1e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000205, "vq_loss_layer_008": 0.000231, "vq_loss_layer_009": 0.000273, "vq_loss_layer_010": 0.000229, "vq_loss_layer_011": 0.000278, "vq_loss_layer_012": 0.000557, "vq_loss_layer_013": 0.000376, "vq_loss_layer_014": 0.000473, "vq_loss_layer_015": 0.000626, "vq_loss_layer_016": 0.000437, "vq_loss_layer_017": 0.000456, "vq_loss_layer_018": 0.000242, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000246, "vq_loss_layer_021": 0.000507, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000338, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000587, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.000881, "vq_loss_layer_029": 0.000977, "vq_loss_layer_030": 0.00174, "vq_loss_layer_031": 0.005066 }, { "ce_loss": 2.278692, "epoch": 0.00445, "grad_norm": 0.0039048411417752504, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.078613, "kv_mse_loss": 0.055356, "kv_vq_loss": 0.000598, "learning_rate": 0.0009120900027452328, "loss": 0.055917, "step": 4450, "value_mse_loss_layer_000": 0.000748, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.008606, "value_mse_loss_layer_003": 0.012878, "value_mse_loss_layer_004": 0.011414, "value_mse_loss_layer_005": 0.011536, "value_mse_loss_layer_006": 0.013916, "value_mse_loss_layer_007": 0.01532, "value_mse_loss_layer_008": 0.018677, "value_mse_loss_layer_009": 0.027466, "value_mse_loss_layer_010": 0.020264, "value_mse_loss_layer_011": 0.021362, "value_mse_loss_layer_012": 0.022095, "value_mse_loss_layer_013": 0.024536, "value_mse_loss_layer_014": 0.024414, "value_mse_loss_layer_015": 0.028564, "value_mse_loss_layer_016": 0.023193, "value_mse_loss_layer_017": 0.026978, "value_mse_loss_layer_018": 0.025024, "value_mse_loss_layer_019": 0.028931, "value_mse_loss_layer_020": 0.03064, "value_mse_loss_layer_021": 0.034668, "value_mse_loss_layer_022": 0.0354, "value_mse_loss_layer_023": 0.040771, "value_mse_loss_layer_024": 0.041748, "value_mse_loss_layer_025": 0.05249, "value_mse_loss_layer_026": 0.045654, "value_mse_loss_layer_027": 0.054443, "value_mse_loss_layer_028": 0.058838, "value_mse_loss_layer_029": 0.086426, "value_mse_loss_layer_030": 0.076172, "value_mse_loss_layer_031": 0.085938, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000221, "vq_loss_layer_008": 0.000222, "vq_loss_layer_009": 0.000507, "vq_loss_layer_010": 0.000237, "vq_loss_layer_011": 0.000278, "vq_loss_layer_012": 0.000454, "vq_loss_layer_013": 0.000406, "vq_loss_layer_014": 0.000433, "vq_loss_layer_015": 0.000519, "vq_loss_layer_016": 0.000435, "vq_loss_layer_017": 0.00041, "vq_loss_layer_018": 0.00025, "vq_loss_layer_019": 0.000191, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.000439, "vq_loss_layer_022": 0.000298, "vq_loss_layer_023": 0.000387, "vq_loss_layer_024": 0.000324, "vq_loss_layer_025": 0.000444, "vq_loss_layer_026": 0.000729, "vq_loss_layer_027": 0.00082, "vq_loss_layer_028": 0.001289, "vq_loss_layer_029": 0.003082, "vq_loss_layer_030": 0.003098, "vq_loss_layer_031": 0.0065 }, { "ce_loss": 2.333492, "epoch": 0.00446, "grad_norm": 0.004539312794804573, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.041748, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.10791, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.135742, "key_mse_loss_layer_014": 0.132812, "key_mse_loss_layer_015": 0.121094, "key_mse_loss_layer_016": 0.114258, "key_mse_loss_layer_017": 0.113281, "key_mse_loss_layer_018": 0.121582, "key_mse_loss_layer_019": 0.094238, "key_mse_loss_layer_020": 0.108887, "key_mse_loss_layer_021": 0.10498, "key_mse_loss_layer_022": 0.111816, "key_mse_loss_layer_023": 0.107422, "key_mse_loss_layer_024": 0.085449, "key_mse_loss_layer_025": 0.078613, "key_mse_loss_layer_026": 0.094727, "key_mse_loss_layer_027": 0.09082, "key_mse_loss_layer_028": 0.099609, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.097656, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.055545, "kv_vq_loss": 0.000603, "learning_rate": 0.0009123337146780353, "loss": 0.056122, "step": 4460, "value_mse_loss_layer_000": 0.000717, "value_mse_loss_layer_001": 0.00209, "value_mse_loss_layer_002": 0.008362, "value_mse_loss_layer_003": 0.012756, "value_mse_loss_layer_004": 0.012817, "value_mse_loss_layer_005": 0.012024, "value_mse_loss_layer_006": 0.014038, "value_mse_loss_layer_007": 0.015381, "value_mse_loss_layer_008": 0.018433, "value_mse_loss_layer_009": 0.023438, "value_mse_loss_layer_010": 0.020752, "value_mse_loss_layer_011": 0.021362, "value_mse_loss_layer_012": 0.022217, "value_mse_loss_layer_013": 0.023315, "value_mse_loss_layer_014": 0.026001, "value_mse_loss_layer_015": 0.025391, "value_mse_loss_layer_016": 0.024048, "value_mse_loss_layer_017": 0.025513, "value_mse_loss_layer_018": 0.023071, "value_mse_loss_layer_019": 0.025757, "value_mse_loss_layer_020": 0.028076, "value_mse_loss_layer_021": 0.034668, "value_mse_loss_layer_022": 0.030151, "value_mse_loss_layer_023": 0.034668, "value_mse_loss_layer_024": 0.040527, "value_mse_loss_layer_025": 0.046631, "value_mse_loss_layer_026": 0.039307, "value_mse_loss_layer_027": 0.054199, "value_mse_loss_layer_028": 0.055908, "value_mse_loss_layer_029": 0.071777, "value_mse_loss_layer_030": 0.081543, "value_mse_loss_layer_031": 0.085449, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 2.6e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 0.00011, "vq_loss_layer_005": 0.000106, "vq_loss_layer_006": 0.000172, "vq_loss_layer_007": 0.000236, "vq_loss_layer_008": 0.000332, "vq_loss_layer_009": 0.000341, "vq_loss_layer_010": 0.000359, "vq_loss_layer_011": 0.00038, "vq_loss_layer_012": 0.000542, "vq_loss_layer_013": 0.000399, "vq_loss_layer_014": 0.000675, "vq_loss_layer_015": 0.000481, "vq_loss_layer_016": 0.000614, "vq_loss_layer_017": 0.000448, "vq_loss_layer_018": 0.000263, "vq_loss_layer_019": 0.000204, "vq_loss_layer_020": 0.000257, "vq_loss_layer_021": 0.000698, "vq_loss_layer_022": 0.00034, "vq_loss_layer_023": 0.000492, "vq_loss_layer_024": 0.000526, "vq_loss_layer_025": 0.000687, "vq_loss_layer_026": 0.000751, "vq_loss_layer_027": 0.000893, "vq_loss_layer_028": 0.00145, "vq_loss_layer_029": 0.001411, "vq_loss_layer_030": 0.003494, "vq_loss_layer_031": 0.007202 }, { "ce_loss": 2.299027, "epoch": 0.00447, "grad_norm": 0.003290798282250762, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.05582, "kv_vq_loss": 0.000614, "learning_rate": 0.000912576880782984, "loss": 0.056403, "step": 4470, "value_mse_loss_layer_000": 0.000706, "value_mse_loss_layer_001": 0.002182, "value_mse_loss_layer_002": 0.008118, "value_mse_loss_layer_003": 0.012878, "value_mse_loss_layer_004": 0.011658, "value_mse_loss_layer_005": 0.011536, "value_mse_loss_layer_006": 0.014221, "value_mse_loss_layer_007": 0.015259, "value_mse_loss_layer_008": 0.018433, "value_mse_loss_layer_009": 0.02356, "value_mse_loss_layer_010": 0.019775, "value_mse_loss_layer_011": 0.020996, "value_mse_loss_layer_012": 0.022461, "value_mse_loss_layer_013": 0.023071, "value_mse_loss_layer_014": 0.024292, "value_mse_loss_layer_015": 0.0271, "value_mse_loss_layer_016": 0.022827, "value_mse_loss_layer_017": 0.027344, "value_mse_loss_layer_018": 0.023682, "value_mse_loss_layer_019": 0.041748, "value_mse_loss_layer_020": 0.030762, "value_mse_loss_layer_021": 0.036621, "value_mse_loss_layer_022": 0.033936, "value_mse_loss_layer_023": 0.038818, "value_mse_loss_layer_024": 0.039551, "value_mse_loss_layer_025": 0.049561, "value_mse_loss_layer_026": 0.044189, "value_mse_loss_layer_027": 0.053711, "value_mse_loss_layer_028": 0.0625, "value_mse_loss_layer_029": 0.083008, "value_mse_loss_layer_030": 0.074219, "value_mse_loss_layer_031": 0.084473, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.000165, "vq_loss_layer_007": 0.000225, "vq_loss_layer_008": 0.000221, "vq_loss_layer_009": 0.000265, "vq_loss_layer_010": 0.00023, "vq_loss_layer_011": 0.000273, "vq_loss_layer_012": 0.000492, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.000443, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.000467, "vq_loss_layer_017": 0.000416, "vq_loss_layer_018": 0.000227, "vq_loss_layer_019": 0.000292, "vq_loss_layer_020": 0.000246, "vq_loss_layer_021": 0.00053, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000351, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.000387, "vq_loss_layer_026": 0.000584, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000912, "vq_loss_layer_029": 0.001076, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.00473 }, { "ce_loss": 2.289741, "epoch": 0.00448, "grad_norm": 0.0027630161494016647, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.055597, "kv_vq_loss": 0.000597, "learning_rate": 0.0009128195034995359, "loss": 0.056165, "step": 4480, "value_mse_loss_layer_000": 0.00071, "value_mse_loss_layer_001": 0.002136, "value_mse_loss_layer_002": 0.007935, "value_mse_loss_layer_003": 0.013184, "value_mse_loss_layer_004": 0.01178, "value_mse_loss_layer_005": 0.011597, "value_mse_loss_layer_006": 0.013794, "value_mse_loss_layer_007": 0.015137, "value_mse_loss_layer_008": 0.018188, "value_mse_loss_layer_009": 0.023926, "value_mse_loss_layer_010": 0.022583, "value_mse_loss_layer_011": 0.02124, "value_mse_loss_layer_012": 0.02124, "value_mse_loss_layer_013": 0.026489, "value_mse_loss_layer_014": 0.023926, "value_mse_loss_layer_015": 0.027344, "value_mse_loss_layer_016": 0.022949, "value_mse_loss_layer_017": 0.026978, "value_mse_loss_layer_018": 0.025146, "value_mse_loss_layer_019": 0.03125, "value_mse_loss_layer_020": 0.030396, "value_mse_loss_layer_021": 0.034912, "value_mse_loss_layer_022": 0.036133, "value_mse_loss_layer_023": 0.039307, "value_mse_loss_layer_024": 0.042236, "value_mse_loss_layer_025": 0.049561, "value_mse_loss_layer_026": 0.044678, "value_mse_loss_layer_027": 0.057373, "value_mse_loss_layer_028": 0.061279, "value_mse_loss_layer_029": 0.081055, "value_mse_loss_layer_030": 0.078125, "value_mse_loss_layer_031": 0.086426, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 8.5e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000143, "vq_loss_layer_007": 0.000231, "vq_loss_layer_008": 0.000215, "vq_loss_layer_009": 0.000307, "vq_loss_layer_010": 0.000294, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.000444, "vq_loss_layer_013": 0.000526, "vq_loss_layer_014": 0.000462, "vq_loss_layer_015": 0.000488, "vq_loss_layer_016": 0.000467, "vq_loss_layer_017": 0.000443, "vq_loss_layer_018": 0.000257, "vq_loss_layer_019": 0.000238, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000425, "vq_loss_layer_022": 0.000309, "vq_loss_layer_023": 0.000332, "vq_loss_layer_024": 0.000359, "vq_loss_layer_025": 0.00034, "vq_loss_layer_026": 0.000587, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000889, "vq_loss_layer_029": 0.001228, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.004944 }, { "ce_loss": 2.305408, "epoch": 0.00449, "grad_norm": 0.004636569879949093, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.055591, "kv_vq_loss": 0.000594, "learning_rate": 0.0009130615852508307, "loss": 0.05614, "step": 4490, "value_mse_loss_layer_000": 0.000698, "value_mse_loss_layer_001": 0.00209, "value_mse_loss_layer_002": 0.00824, "value_mse_loss_layer_003": 0.013245, "value_mse_loss_layer_004": 0.012268, "value_mse_loss_layer_005": 0.012024, "value_mse_loss_layer_006": 0.015076, "value_mse_loss_layer_007": 0.016113, "value_mse_loss_layer_008": 0.018555, "value_mse_loss_layer_009": 0.025391, "value_mse_loss_layer_010": 0.021484, "value_mse_loss_layer_011": 0.021484, "value_mse_loss_layer_012": 0.023438, "value_mse_loss_layer_013": 0.024658, "value_mse_loss_layer_014": 0.029541, "value_mse_loss_layer_015": 0.026489, "value_mse_loss_layer_016": 0.023315, "value_mse_loss_layer_017": 0.027466, "value_mse_loss_layer_018": 0.024902, "value_mse_loss_layer_019": 0.028564, "value_mse_loss_layer_020": 0.030762, "value_mse_loss_layer_021": 0.033447, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.042969, "value_mse_loss_layer_024": 0.043945, "value_mse_loss_layer_025": 0.049561, "value_mse_loss_layer_026": 0.044434, "value_mse_loss_layer_027": 0.056152, "value_mse_loss_layer_028": 0.059082, "value_mse_loss_layer_029": 0.085449, "value_mse_loss_layer_030": 0.095703, "value_mse_loss_layer_031": 0.084961, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 8.7e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000203, "vq_loss_layer_007": 0.000248, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000368, "vq_loss_layer_010": 0.000334, "vq_loss_layer_011": 0.000284, "vq_loss_layer_012": 0.000546, "vq_loss_layer_013": 0.000425, "vq_loss_layer_014": 0.000702, "vq_loss_layer_015": 0.000484, "vq_loss_layer_016": 0.000479, "vq_loss_layer_017": 0.000488, "vq_loss_layer_018": 0.000299, "vq_loss_layer_019": 0.000219, "vq_loss_layer_020": 0.000275, "vq_loss_layer_021": 0.000458, "vq_loss_layer_022": 0.000315, "vq_loss_layer_023": 0.000519, "vq_loss_layer_024": 0.000391, "vq_loss_layer_025": 0.000477, "vq_loss_layer_026": 0.000656, "vq_loss_layer_027": 0.000664, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.001389, "vq_loss_layer_030": 0.00293, "vq_loss_layer_031": 0.005707 }, { "ce_loss": 2.308544, "epoch": 0.0045, "grad_norm": 0.004052004776895046, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.05582, "kv_vq_loss": 0.000635, "learning_rate": 0.0009133031284438358, "loss": 0.056436, "step": 4500, "value_mse_loss_layer_000": 0.000706, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.007935, "value_mse_loss_layer_003": 0.015747, "value_mse_loss_layer_004": 0.011475, "value_mse_loss_layer_005": 0.011353, "value_mse_loss_layer_006": 0.013611, "value_mse_loss_layer_007": 0.014893, "value_mse_loss_layer_008": 0.018433, "value_mse_loss_layer_009": 0.023438, "value_mse_loss_layer_010": 0.019531, "value_mse_loss_layer_011": 0.021606, "value_mse_loss_layer_012": 0.022339, "value_mse_loss_layer_013": 0.022827, "value_mse_loss_layer_014": 0.025513, "value_mse_loss_layer_015": 0.027466, "value_mse_loss_layer_016": 0.023682, "value_mse_loss_layer_017": 0.026245, "value_mse_loss_layer_018": 0.024536, "value_mse_loss_layer_019": 0.028198, "value_mse_loss_layer_020": 0.029907, "value_mse_loss_layer_021": 0.050293, "value_mse_loss_layer_022": 0.034424, "value_mse_loss_layer_023": 0.038086, "value_mse_loss_layer_024": 0.043945, "value_mse_loss_layer_025": 0.052246, "value_mse_loss_layer_026": 0.046875, "value_mse_loss_layer_027": 0.061279, "value_mse_loss_layer_028": 0.061035, "value_mse_loss_layer_029": 0.086426, "value_mse_loss_layer_030": 0.081055, "value_mse_loss_layer_031": 0.091797, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000147, "vq_loss_layer_007": 0.000202, "vq_loss_layer_008": 0.000215, "vq_loss_layer_009": 0.000259, "vq_loss_layer_010": 0.000224, "vq_loss_layer_011": 0.000309, "vq_loss_layer_012": 0.000486, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.000492, "vq_loss_layer_015": 0.000475, "vq_loss_layer_016": 0.000479, "vq_loss_layer_017": 0.000357, "vq_loss_layer_018": 0.000242, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000664, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000326, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000553, "vq_loss_layer_027": 0.000648, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.001205, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.00531 }, { "ce_loss": 2.214999, "epoch": 0.00451, "grad_norm": 0.003196069970726967, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.04126, "key_mse_loss_layer_005": 0.055908, "key_mse_loss_layer_006": 0.062012, "key_mse_loss_layer_007": 0.070801, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.089844, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.081543, "key_mse_loss_layer_020": 0.088379, "key_mse_loss_layer_021": 0.083984, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.075684, "key_mse_loss_layer_031": 0.058838, "kv_mse_loss": 0.055768, "kv_vq_loss": 0.00064, "learning_rate": 0.0009135441354694901, "loss": 0.056387, "step": 4510, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.002075, "value_mse_loss_layer_002": 0.008179, "value_mse_loss_layer_003": 0.01355, "value_mse_loss_layer_004": 0.012085, "value_mse_loss_layer_005": 0.011963, "value_mse_loss_layer_006": 0.013916, "value_mse_loss_layer_007": 0.015259, "value_mse_loss_layer_008": 0.019043, "value_mse_loss_layer_009": 0.024902, "value_mse_loss_layer_010": 0.023682, "value_mse_loss_layer_011": 0.02124, "value_mse_loss_layer_012": 0.023071, "value_mse_loss_layer_013": 0.024414, "value_mse_loss_layer_014": 0.026123, "value_mse_loss_layer_015": 0.027222, "value_mse_loss_layer_016": 0.02356, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.024414, "value_mse_loss_layer_019": 0.029175, "value_mse_loss_layer_020": 0.028809, "value_mse_loss_layer_021": 0.039062, "value_mse_loss_layer_022": 0.033203, "value_mse_loss_layer_023": 0.039307, "value_mse_loss_layer_024": 0.045166, "value_mse_loss_layer_025": 0.051025, "value_mse_loss_layer_026": 0.046875, "value_mse_loss_layer_027": 0.060791, "value_mse_loss_layer_028": 0.0625, "value_mse_loss_layer_029": 0.091309, "value_mse_loss_layer_030": 0.083984, "value_mse_loss_layer_031": 0.09375, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2.1e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 9.9e-05, "vq_loss_layer_006": 0.000166, "vq_loss_layer_007": 0.000208, "vq_loss_layer_008": 0.000286, "vq_loss_layer_009": 0.000345, "vq_loss_layer_010": 0.000385, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000492, "vq_loss_layer_013": 0.000401, "vq_loss_layer_014": 0.000561, "vq_loss_layer_015": 0.000546, "vq_loss_layer_016": 0.000576, "vq_loss_layer_017": 0.000414, "vq_loss_layer_018": 0.000263, "vq_loss_layer_019": 0.000218, "vq_loss_layer_020": 0.000207, "vq_loss_layer_021": 0.000534, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000341, "vq_loss_layer_024": 0.000366, "vq_loss_layer_025": 0.000402, "vq_loss_layer_026": 0.000629, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.001305, "vq_loss_layer_029": 0.001877, "vq_loss_layer_030": 0.002655, "vq_loss_layer_031": 0.007446 }, { "ce_loss": 2.292364, "epoch": 0.00452, "grad_norm": 0.003153494792059064, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.055606, "kv_vq_loss": 0.000599, "learning_rate": 0.0009137846087028455, "loss": 0.056174, "step": 4520, "value_mse_loss_layer_000": 0.000683, "value_mse_loss_layer_001": 0.002045, "value_mse_loss_layer_002": 0.008179, "value_mse_loss_layer_003": 0.012695, "value_mse_loss_layer_004": 0.013123, "value_mse_loss_layer_005": 0.011719, "value_mse_loss_layer_006": 0.013916, "value_mse_loss_layer_007": 0.015259, "value_mse_loss_layer_008": 0.017944, "value_mse_loss_layer_009": 0.02356, "value_mse_loss_layer_010": 0.020142, "value_mse_loss_layer_011": 0.021118, "value_mse_loss_layer_012": 0.021973, "value_mse_loss_layer_013": 0.02356, "value_mse_loss_layer_014": 0.02478, "value_mse_loss_layer_015": 0.026367, "value_mse_loss_layer_016": 0.021973, "value_mse_loss_layer_017": 0.026123, "value_mse_loss_layer_018": 0.023682, "value_mse_loss_layer_019": 0.027222, "value_mse_loss_layer_020": 0.040771, "value_mse_loss_layer_021": 0.036865, "value_mse_loss_layer_022": 0.033691, "value_mse_loss_layer_023": 0.038086, "value_mse_loss_layer_024": 0.04126, "value_mse_loss_layer_025": 0.048096, "value_mse_loss_layer_026": 0.041992, "value_mse_loss_layer_027": 0.055176, "value_mse_loss_layer_028": 0.059814, "value_mse_loss_layer_029": 0.078613, "value_mse_loss_layer_030": 0.075195, "value_mse_loss_layer_031": 0.091309, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 0.000105, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000153, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000235, "vq_loss_layer_009": 0.000296, "vq_loss_layer_010": 0.000277, "vq_loss_layer_011": 0.000303, "vq_loss_layer_012": 0.000448, "vq_loss_layer_013": 0.000395, "vq_loss_layer_014": 0.000534, "vq_loss_layer_015": 0.000483, "vq_loss_layer_016": 0.000439, "vq_loss_layer_017": 0.000397, "vq_loss_layer_018": 0.000259, "vq_loss_layer_019": 0.000203, "vq_loss_layer_020": 0.000319, "vq_loss_layer_021": 0.000488, "vq_loss_layer_022": 0.000338, "vq_loss_layer_023": 0.000389, "vq_loss_layer_024": 0.000362, "vq_loss_layer_025": 0.000422, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.001137, "vq_loss_layer_029": 0.001373, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.006348 }, { "ce_loss": 2.27012, "epoch": 0.00453, "grad_norm": 0.0037299024406820536, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.056, "kv_vq_loss": 0.000624, "learning_rate": 0.0009140245505032077, "loss": 0.056613, "step": 4530, "value_mse_loss_layer_000": 0.000683, "value_mse_loss_layer_001": 0.002075, "value_mse_loss_layer_002": 0.007996, "value_mse_loss_layer_003": 0.012573, "value_mse_loss_layer_004": 0.012268, "value_mse_loss_layer_005": 0.011841, "value_mse_loss_layer_006": 0.014648, "value_mse_loss_layer_007": 0.015869, "value_mse_loss_layer_008": 0.018677, "value_mse_loss_layer_009": 0.023926, "value_mse_loss_layer_010": 0.019775, "value_mse_loss_layer_011": 0.021606, "value_mse_loss_layer_012": 0.022461, "value_mse_loss_layer_013": 0.02356, "value_mse_loss_layer_014": 0.02478, "value_mse_loss_layer_015": 0.027588, "value_mse_loss_layer_016": 0.026733, "value_mse_loss_layer_017": 0.026245, "value_mse_loss_layer_018": 0.024048, "value_mse_loss_layer_019": 0.028198, "value_mse_loss_layer_020": 0.029663, "value_mse_loss_layer_021": 0.034424, "value_mse_loss_layer_022": 0.033936, "value_mse_loss_layer_023": 0.036377, "value_mse_loss_layer_024": 0.040039, "value_mse_loss_layer_025": 0.050049, "value_mse_loss_layer_026": 0.057861, "value_mse_loss_layer_027": 0.054932, "value_mse_loss_layer_028": 0.057861, "value_mse_loss_layer_029": 0.080078, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.088379, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 8.8e-05, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000205, "vq_loss_layer_007": 0.000256, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.000311, "vq_loss_layer_010": 0.000256, "vq_loss_layer_011": 0.000307, "vq_loss_layer_012": 0.0005, "vq_loss_layer_013": 0.000399, "vq_loss_layer_014": 0.000507, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.000648, "vq_loss_layer_017": 0.000435, "vq_loss_layer_018": 0.000252, "vq_loss_layer_019": 0.000233, "vq_loss_layer_020": 0.000275, "vq_loss_layer_021": 0.0005, "vq_loss_layer_022": 0.000353, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000366, "vq_loss_layer_025": 0.000471, "vq_loss_layer_026": 0.00116, "vq_loss_layer_027": 0.000736, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.001465, "vq_loss_layer_030": 0.002655, "vq_loss_layer_031": 0.006226 }, { "ce_loss": 2.299783, "epoch": 0.00454, "grad_norm": 0.0027907434850931168, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.082031, "key_mse_loss_layer_020": 0.089355, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.086426, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.066406, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.073242, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.05589, "kv_vq_loss": 0.000617, "learning_rate": 0.0009142639632142759, "loss": 0.056482, "step": 4540, "value_mse_loss_layer_000": 0.000702, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.007935, "value_mse_loss_layer_003": 0.013428, "value_mse_loss_layer_004": 0.012024, "value_mse_loss_layer_005": 0.012512, "value_mse_loss_layer_006": 0.013916, "value_mse_loss_layer_007": 0.015259, "value_mse_loss_layer_008": 0.018311, "value_mse_loss_layer_009": 0.024048, "value_mse_loss_layer_010": 0.020508, "value_mse_loss_layer_011": 0.021118, "value_mse_loss_layer_012": 0.022217, "value_mse_loss_layer_013": 0.023315, "value_mse_loss_layer_014": 0.026001, "value_mse_loss_layer_015": 0.027832, "value_mse_loss_layer_016": 0.023315, "value_mse_loss_layer_017": 0.028198, "value_mse_loss_layer_018": 0.023926, "value_mse_loss_layer_019": 0.028442, "value_mse_loss_layer_020": 0.030029, "value_mse_loss_layer_021": 0.0354, "value_mse_loss_layer_022": 0.034424, "value_mse_loss_layer_023": 0.037598, "value_mse_loss_layer_024": 0.039551, "value_mse_loss_layer_025": 0.047852, "value_mse_loss_layer_026": 0.047119, "value_mse_loss_layer_027": 0.056885, "value_mse_loss_layer_028": 0.05957, "value_mse_loss_layer_029": 0.081543, "value_mse_loss_layer_030": 0.074707, "value_mse_loss_layer_031": 0.081055, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 0.00011, "vq_loss_layer_006": 0.00014, "vq_loss_layer_007": 0.000213, "vq_loss_layer_008": 0.000202, "vq_loss_layer_009": 0.000292, "vq_loss_layer_010": 0.000244, "vq_loss_layer_011": 0.000265, "vq_loss_layer_012": 0.00046, "vq_loss_layer_013": 0.000376, "vq_loss_layer_014": 0.000553, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.000462, "vq_loss_layer_017": 0.00061, "vq_loss_layer_018": 0.000226, "vq_loss_layer_019": 0.000205, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.000446, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.000324, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.000633, "vq_loss_layer_027": 0.000702, "vq_loss_layer_028": 0.000965, "vq_loss_layer_029": 0.001808, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.005859 }, { "ce_loss": 2.341232, "epoch": 0.00455, "grad_norm": 0.003017499577254057, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.106445, "key_mse_loss_layer_011": 0.104492, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.123535, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.10791, "key_mse_loss_layer_017": 0.108398, "key_mse_loss_layer_018": 0.116699, "key_mse_loss_layer_019": 0.097656, "key_mse_loss_layer_020": 0.109375, "key_mse_loss_layer_021": 0.102051, "key_mse_loss_layer_022": 0.105469, "key_mse_loss_layer_023": 0.106445, "key_mse_loss_layer_024": 0.084961, "key_mse_loss_layer_025": 0.081543, "key_mse_loss_layer_026": 0.09668, "key_mse_loss_layer_027": 0.095703, "key_mse_loss_layer_028": 0.102539, "key_mse_loss_layer_029": 0.094238, "key_mse_loss_layer_030": 0.098145, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.055215, "kv_vq_loss": 0.000596, "learning_rate": 0.0009145028491642779, "loss": 0.055786, "step": 4550, "value_mse_loss_layer_000": 0.000687, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.008179, "value_mse_loss_layer_003": 0.013062, "value_mse_loss_layer_004": 0.012878, "value_mse_loss_layer_005": 0.012329, "value_mse_loss_layer_006": 0.014221, "value_mse_loss_layer_007": 0.015747, "value_mse_loss_layer_008": 0.018799, "value_mse_loss_layer_009": 0.02356, "value_mse_loss_layer_010": 0.020142, "value_mse_loss_layer_011": 0.021118, "value_mse_loss_layer_012": 0.021606, "value_mse_loss_layer_013": 0.023804, "value_mse_loss_layer_014": 0.024536, "value_mse_loss_layer_015": 0.025269, "value_mse_loss_layer_016": 0.022461, "value_mse_loss_layer_017": 0.025635, "value_mse_loss_layer_018": 0.024902, "value_mse_loss_layer_019": 0.027344, "value_mse_loss_layer_020": 0.029785, "value_mse_loss_layer_021": 0.035645, "value_mse_loss_layer_022": 0.033691, "value_mse_loss_layer_023": 0.046631, "value_mse_loss_layer_024": 0.046387, "value_mse_loss_layer_025": 0.049805, "value_mse_loss_layer_026": 0.047119, "value_mse_loss_layer_027": 0.060791, "value_mse_loss_layer_028": 0.062988, "value_mse_loss_layer_029": 0.086426, "value_mse_loss_layer_030": 0.082031, "value_mse_loss_layer_031": 0.090332, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 8.4e-05, "vq_loss_layer_005": 9.8e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000228, "vq_loss_layer_008": 0.000286, "vq_loss_layer_009": 0.000324, "vq_loss_layer_010": 0.000275, "vq_loss_layer_011": 0.000296, "vq_loss_layer_012": 0.000463, "vq_loss_layer_013": 0.000458, "vq_loss_layer_014": 0.000523, "vq_loss_layer_015": 0.000492, "vq_loss_layer_016": 0.000492, "vq_loss_layer_017": 0.000435, "vq_loss_layer_018": 0.000273, "vq_loss_layer_019": 0.000269, "vq_loss_layer_020": 0.000257, "vq_loss_layer_021": 0.000507, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000507, "vq_loss_layer_024": 0.000395, "vq_loss_layer_025": 0.000519, "vq_loss_layer_026": 0.000744, "vq_loss_layer_027": 0.000782, "vq_loss_layer_028": 0.001366, "vq_loss_layer_029": 0.002274, "vq_loss_layer_030": 0.003174, "vq_loss_layer_031": 0.006531 }, { "ce_loss": 2.29658, "epoch": 0.00456, "grad_norm": 0.004476854111999273, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.104492, "key_mse_loss_layer_017": 0.105469, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.100586, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.055267, "kv_vq_loss": 0.000609, "learning_rate": 0.0009147412106661087, "loss": 0.055865, "step": 4560, "value_mse_loss_layer_000": 0.000683, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.008179, "value_mse_loss_layer_003": 0.013855, "value_mse_loss_layer_004": 0.011963, "value_mse_loss_layer_005": 0.011902, "value_mse_loss_layer_006": 0.014038, "value_mse_loss_layer_007": 0.015137, "value_mse_loss_layer_008": 0.017822, "value_mse_loss_layer_009": 0.022705, "value_mse_loss_layer_010": 0.019897, "value_mse_loss_layer_011": 0.020874, "value_mse_loss_layer_012": 0.021729, "value_mse_loss_layer_013": 0.022949, "value_mse_loss_layer_014": 0.02417, "value_mse_loss_layer_015": 0.026123, "value_mse_loss_layer_016": 0.021118, "value_mse_loss_layer_017": 0.027344, "value_mse_loss_layer_018": 0.02417, "value_mse_loss_layer_019": 0.027222, "value_mse_loss_layer_020": 0.028809, "value_mse_loss_layer_021": 0.041992, "value_mse_loss_layer_022": 0.036621, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.038086, "value_mse_loss_layer_025": 0.05127, "value_mse_loss_layer_026": 0.04248, "value_mse_loss_layer_027": 0.054199, "value_mse_loss_layer_028": 0.057617, "value_mse_loss_layer_029": 0.089355, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.091797, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000179, "vq_loss_layer_007": 0.000233, "vq_loss_layer_008": 0.00025, "vq_loss_layer_009": 0.000292, "vq_loss_layer_010": 0.000292, "vq_loss_layer_011": 0.000301, "vq_loss_layer_012": 0.000523, "vq_loss_layer_013": 0.000391, "vq_loss_layer_014": 0.000549, "vq_loss_layer_015": 0.000507, "vq_loss_layer_016": 0.000454, "vq_loss_layer_017": 0.000557, "vq_loss_layer_018": 0.000284, "vq_loss_layer_019": 0.000234, "vq_loss_layer_020": 0.000292, "vq_loss_layer_021": 0.000736, "vq_loss_layer_022": 0.000427, "vq_loss_layer_023": 0.000324, "vq_loss_layer_024": 0.000322, "vq_loss_layer_025": 0.000496, "vq_loss_layer_026": 0.000587, "vq_loss_layer_027": 0.000641, "vq_loss_layer_028": 0.001167, "vq_loss_layer_029": 0.001587, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.006317 }, { "ce_loss": 2.311315, "epoch": 0.00457, "grad_norm": 0.0036817125510424376, "key_mse_loss_layer_000": 0.002625, "key_mse_loss_layer_001": 0.009583, "key_mse_loss_layer_002": 0.05127, "key_mse_loss_layer_003": 0.043213, "key_mse_loss_layer_004": 0.041748, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.06543, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.071777, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.075195, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.060791, "kv_mse_loss": 0.055493, "kv_vq_loss": 0.0006, "learning_rate": 0.0009149790500174624, "loss": 0.056058, "step": 4570, "value_mse_loss_layer_000": 0.000702, "value_mse_loss_layer_001": 0.00209, "value_mse_loss_layer_002": 0.00885, "value_mse_loss_layer_003": 0.01416, "value_mse_loss_layer_004": 0.012085, "value_mse_loss_layer_005": 0.012146, "value_mse_loss_layer_006": 0.014038, "value_mse_loss_layer_007": 0.016113, "value_mse_loss_layer_008": 0.020508, "value_mse_loss_layer_009": 0.026367, "value_mse_loss_layer_010": 0.02124, "value_mse_loss_layer_011": 0.023315, "value_mse_loss_layer_012": 0.022583, "value_mse_loss_layer_013": 0.025391, "value_mse_loss_layer_014": 0.026001, "value_mse_loss_layer_015": 0.027954, "value_mse_loss_layer_016": 0.024048, "value_mse_loss_layer_017": 0.028076, "value_mse_loss_layer_018": 0.023926, "value_mse_loss_layer_019": 0.032715, "value_mse_loss_layer_020": 0.030151, "value_mse_loss_layer_021": 0.034424, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.036133, "value_mse_loss_layer_024": 0.037109, "value_mse_loss_layer_025": 0.051514, "value_mse_loss_layer_026": 0.042969, "value_mse_loss_layer_027": 0.057861, "value_mse_loss_layer_028": 0.055664, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.083496, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 9.9e-05, "vq_loss_layer_006": 0.00015, "vq_loss_layer_007": 0.000236, "vq_loss_layer_008": 0.000366, "vq_loss_layer_009": 0.000374, "vq_loss_layer_010": 0.00028, "vq_loss_layer_011": 0.000359, "vq_loss_layer_012": 0.000484, "vq_loss_layer_013": 0.000444, "vq_loss_layer_014": 0.000523, "vq_loss_layer_015": 0.000538, "vq_loss_layer_016": 0.000488, "vq_loss_layer_017": 0.000446, "vq_loss_layer_018": 0.000263, "vq_loss_layer_019": 0.000261, "vq_loss_layer_020": 0.000301, "vq_loss_layer_021": 0.0005, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.000362, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000463, "vq_loss_layer_026": 0.000648, "vq_loss_layer_027": 0.000698, "vq_loss_layer_028": 0.000854, "vq_loss_layer_029": 0.001144, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.005463 }, { "ce_loss": 2.238668, "epoch": 0.00458, "grad_norm": 0.003387728938832879, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.100098, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.055594, "kv_vq_loss": 0.000601, "learning_rate": 0.0009152163695009672, "loss": 0.056165, "step": 4580, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.008301, "value_mse_loss_layer_003": 0.012939, "value_mse_loss_layer_004": 0.013611, "value_mse_loss_layer_005": 0.011719, "value_mse_loss_layer_006": 0.014038, "value_mse_loss_layer_007": 0.014954, "value_mse_loss_layer_008": 0.018433, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.020142, "value_mse_loss_layer_011": 0.020752, "value_mse_loss_layer_012": 0.021362, "value_mse_loss_layer_013": 0.022217, "value_mse_loss_layer_014": 0.023804, "value_mse_loss_layer_015": 0.025024, "value_mse_loss_layer_016": 0.022339, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.02478, "value_mse_loss_layer_019": 0.027222, "value_mse_loss_layer_020": 0.029541, "value_mse_loss_layer_021": 0.034668, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.042969, "value_mse_loss_layer_025": 0.051025, "value_mse_loss_layer_026": 0.044922, "value_mse_loss_layer_027": 0.05835, "value_mse_loss_layer_028": 0.058838, "value_mse_loss_layer_029": 0.083984, "value_mse_loss_layer_030": 0.084473, "value_mse_loss_layer_031": 0.085938, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 0.000153, "vq_loss_layer_005": 8.8e-05, "vq_loss_layer_006": 0.000164, "vq_loss_layer_007": 0.000227, "vq_loss_layer_008": 0.000267, "vq_loss_layer_009": 0.000305, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000275, "vq_loss_layer_012": 0.000477, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000477, "vq_loss_layer_015": 0.000431, "vq_loss_layer_016": 0.000444, "vq_loss_layer_017": 0.000378, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000214, "vq_loss_layer_021": 0.000431, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000576, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.0047 }, { "ce_loss": 2.241898, "epoch": 0.00459, "grad_norm": 0.0032746002543717623, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.041748, "key_mse_loss_layer_005": 0.05542, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.071289, "key_mse_loss_layer_008": 0.079102, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.094727, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.07373, "key_mse_loss_layer_030": 0.070801, "key_mse_loss_layer_031": 0.055176, "kv_mse_loss": 0.055438, "kv_vq_loss": 0.000596, "learning_rate": 0.0009154531713843153, "loss": 0.056009, "step": 4590, "value_mse_loss_layer_000": 0.000732, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.008423, "value_mse_loss_layer_003": 0.013, "value_mse_loss_layer_004": 0.012756, "value_mse_loss_layer_005": 0.012146, "value_mse_loss_layer_006": 0.014587, "value_mse_loss_layer_007": 0.016357, "value_mse_loss_layer_008": 0.017944, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.019775, "value_mse_loss_layer_011": 0.020386, "value_mse_loss_layer_012": 0.022095, "value_mse_loss_layer_013": 0.023438, "value_mse_loss_layer_014": 0.02417, "value_mse_loss_layer_015": 0.026733, "value_mse_loss_layer_016": 0.023315, "value_mse_loss_layer_017": 0.027344, "value_mse_loss_layer_018": 0.023193, "value_mse_loss_layer_019": 0.0271, "value_mse_loss_layer_020": 0.030762, "value_mse_loss_layer_021": 0.035889, "value_mse_loss_layer_022": 0.033691, "value_mse_loss_layer_023": 0.037354, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.049561, "value_mse_loss_layer_026": 0.042969, "value_mse_loss_layer_027": 0.054443, "value_mse_loss_layer_028": 0.061768, "value_mse_loss_layer_029": 0.077148, "value_mse_loss_layer_030": 0.074219, "value_mse_loss_layer_031": 0.088867, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.4e-05, "vq_loss_layer_003": 5e-05, "vq_loss_layer_004": 9e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000185, "vq_loss_layer_007": 0.000263, "vq_loss_layer_008": 0.000263, "vq_loss_layer_009": 0.000301, "vq_loss_layer_010": 0.000328, "vq_loss_layer_011": 0.000341, "vq_loss_layer_012": 0.000523, "vq_loss_layer_013": 0.000538, "vq_loss_layer_014": 0.000572, "vq_loss_layer_015": 0.000629, "vq_loss_layer_016": 0.000607, "vq_loss_layer_017": 0.000469, "vq_loss_layer_018": 0.000233, "vq_loss_layer_019": 0.000224, "vq_loss_layer_020": 0.000292, "vq_loss_layer_021": 0.000599, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.000343, "vq_loss_layer_024": 0.000332, "vq_loss_layer_025": 0.00046, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.001282, "vq_loss_layer_029": 0.001495, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.006989 }, { "ce_loss": 2.266458, "epoch": 0.0046, "grad_norm": 0.004439635667949915, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.055301, "kv_vq_loss": 0.000596, "learning_rate": 0.0009156894579203935, "loss": 0.055869, "step": 4600, "value_mse_loss_layer_000": 0.000698, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.008057, "value_mse_loss_layer_003": 0.013245, "value_mse_loss_layer_004": 0.011658, "value_mse_loss_layer_005": 0.011902, "value_mse_loss_layer_006": 0.013672, "value_mse_loss_layer_007": 0.015503, "value_mse_loss_layer_008": 0.018555, "value_mse_loss_layer_009": 0.022949, "value_mse_loss_layer_010": 0.02063, "value_mse_loss_layer_011": 0.020386, "value_mse_loss_layer_012": 0.02124, "value_mse_loss_layer_013": 0.022705, "value_mse_loss_layer_014": 0.024902, "value_mse_loss_layer_015": 0.026123, "value_mse_loss_layer_016": 0.021973, "value_mse_loss_layer_017": 0.025879, "value_mse_loss_layer_018": 0.029297, "value_mse_loss_layer_019": 0.028198, "value_mse_loss_layer_020": 0.031738, "value_mse_loss_layer_021": 0.03418, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.039062, "value_mse_loss_layer_025": 0.048584, "value_mse_loss_layer_026": 0.039551, "value_mse_loss_layer_027": 0.053223, "value_mse_loss_layer_028": 0.060303, "value_mse_loss_layer_029": 0.084473, "value_mse_loss_layer_030": 0.081543, "value_mse_loss_layer_031": 0.083984, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 8.9e-05, "vq_loss_layer_006": 0.000139, "vq_loss_layer_007": 0.000241, "vq_loss_layer_008": 0.000237, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000256, "vq_loss_layer_011": 0.000271, "vq_loss_layer_012": 0.000467, "vq_loss_layer_013": 0.000341, "vq_loss_layer_014": 0.000561, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000414, "vq_loss_layer_017": 0.00036, "vq_loss_layer_018": 0.000341, "vq_loss_layer_019": 0.000201, "vq_loss_layer_020": 0.000241, "vq_loss_layer_021": 0.0005, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.000362, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.00037, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001038, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.005096 }, { "ce_loss": 2.284102, "epoch": 0.00461, "grad_norm": 0.00341276777908206, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.055301, "kv_vq_loss": 0.000596, "learning_rate": 0.0009159252313474119, "loss": 0.055872, "step": 4610, "value_mse_loss_layer_000": 0.000713, "value_mse_loss_layer_001": 0.00209, "value_mse_loss_layer_002": 0.007782, "value_mse_loss_layer_003": 0.012939, "value_mse_loss_layer_004": 0.011658, "value_mse_loss_layer_005": 0.011292, "value_mse_loss_layer_006": 0.01355, "value_mse_loss_layer_007": 0.014832, "value_mse_loss_layer_008": 0.018066, "value_mse_loss_layer_009": 0.025024, "value_mse_loss_layer_010": 0.020264, "value_mse_loss_layer_011": 0.020996, "value_mse_loss_layer_012": 0.021484, "value_mse_loss_layer_013": 0.023315, "value_mse_loss_layer_014": 0.025635, "value_mse_loss_layer_015": 0.027832, "value_mse_loss_layer_016": 0.022461, "value_mse_loss_layer_017": 0.027588, "value_mse_loss_layer_018": 0.023193, "value_mse_loss_layer_019": 0.030396, "value_mse_loss_layer_020": 0.029907, "value_mse_loss_layer_021": 0.036621, "value_mse_loss_layer_022": 0.03418, "value_mse_loss_layer_023": 0.037598, "value_mse_loss_layer_024": 0.039795, "value_mse_loss_layer_025": 0.048584, "value_mse_loss_layer_026": 0.042236, "value_mse_loss_layer_027": 0.053223, "value_mse_loss_layer_028": 0.057373, "value_mse_loss_layer_029": 0.079102, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.084473, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 8.5e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.000142, "vq_loss_layer_007": 0.000213, "vq_loss_layer_008": 0.000205, "vq_loss_layer_009": 0.000364, "vq_loss_layer_010": 0.000234, "vq_loss_layer_011": 0.000269, "vq_loss_layer_012": 0.000448, "vq_loss_layer_013": 0.000366, "vq_loss_layer_014": 0.000469, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.000414, "vq_loss_layer_017": 0.000427, "vq_loss_layer_018": 0.000218, "vq_loss_layer_019": 0.000203, "vq_loss_layer_020": 0.000232, "vq_loss_layer_021": 0.000473, "vq_loss_layer_022": 0.000292, "vq_loss_layer_023": 0.000332, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.004852 }, { "ce_loss": 2.297907, "epoch": 0.00462, "grad_norm": 0.004045460838824511, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.055487, "kv_vq_loss": 0.000606, "learning_rate": 0.0009161604938890311, "loss": 0.056049, "step": 4620, "value_mse_loss_layer_000": 0.000698, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.008057, "value_mse_loss_layer_003": 0.013123, "value_mse_loss_layer_004": 0.011597, "value_mse_loss_layer_005": 0.011475, "value_mse_loss_layer_006": 0.014465, "value_mse_loss_layer_007": 0.015198, "value_mse_loss_layer_008": 0.018555, "value_mse_loss_layer_009": 0.023193, "value_mse_loss_layer_010": 0.019653, "value_mse_loss_layer_011": 0.020752, "value_mse_loss_layer_012": 0.023926, "value_mse_loss_layer_013": 0.023315, "value_mse_loss_layer_014": 0.02356, "value_mse_loss_layer_015": 0.026978, "value_mse_loss_layer_016": 0.02356, "value_mse_loss_layer_017": 0.026855, "value_mse_loss_layer_018": 0.024048, "value_mse_loss_layer_019": 0.027832, "value_mse_loss_layer_020": 0.029175, "value_mse_loss_layer_021": 0.052979, "value_mse_loss_layer_022": 0.034668, "value_mse_loss_layer_023": 0.038818, "value_mse_loss_layer_024": 0.04541, "value_mse_loss_layer_025": 0.051514, "value_mse_loss_layer_026": 0.04248, "value_mse_loss_layer_027": 0.057373, "value_mse_loss_layer_028": 0.060791, "value_mse_loss_layer_029": 0.086426, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.085938, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000178, "vq_loss_layer_007": 0.000229, "vq_loss_layer_008": 0.000229, "vq_loss_layer_009": 0.000275, "vq_loss_layer_010": 0.000234, "vq_loss_layer_011": 0.000273, "vq_loss_layer_012": 0.000622, "vq_loss_layer_013": 0.000389, "vq_loss_layer_014": 0.000425, "vq_loss_layer_015": 0.000469, "vq_loss_layer_016": 0.0005, "vq_loss_layer_017": 0.000402, "vq_loss_layer_018": 0.00023, "vq_loss_layer_019": 0.000191, "vq_loss_layer_020": 0.000207, "vq_loss_layer_021": 0.000725, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.00036, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000456, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.005127 }, { "ce_loss": 2.316093, "epoch": 0.00463, "grad_norm": 0.003600550116971135, "key_mse_loss_layer_000": 0.002884, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.095215, "key_mse_loss_layer_010": 0.109375, "key_mse_loss_layer_011": 0.105469, "key_mse_loss_layer_012": 0.078613, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.115234, "key_mse_loss_layer_016": 0.108398, "key_mse_loss_layer_017": 0.108887, "key_mse_loss_layer_018": 0.115723, "key_mse_loss_layer_019": 0.100586, "key_mse_loss_layer_020": 0.111816, "key_mse_loss_layer_021": 0.103027, "key_mse_loss_layer_022": 0.105469, "key_mse_loss_layer_023": 0.104492, "key_mse_loss_layer_024": 0.082031, "key_mse_loss_layer_025": 0.080078, "key_mse_loss_layer_026": 0.095215, "key_mse_loss_layer_027": 0.091309, "key_mse_loss_layer_028": 0.099609, "key_mse_loss_layer_029": 0.088867, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.055417, "kv_vq_loss": 0.000618, "learning_rate": 0.0009163952477544881, "loss": 0.056024, "step": 4630, "value_mse_loss_layer_000": 0.000664, "value_mse_loss_layer_001": 0.002075, "value_mse_loss_layer_002": 0.00824, "value_mse_loss_layer_003": 0.01355, "value_mse_loss_layer_004": 0.013184, "value_mse_loss_layer_005": 0.012329, "value_mse_loss_layer_006": 0.014709, "value_mse_loss_layer_007": 0.016235, "value_mse_loss_layer_008": 0.018188, "value_mse_loss_layer_009": 0.024658, "value_mse_loss_layer_010": 0.021973, "value_mse_loss_layer_011": 0.022339, "value_mse_loss_layer_012": 0.022705, "value_mse_loss_layer_013": 0.024536, "value_mse_loss_layer_014": 0.024414, "value_mse_loss_layer_015": 0.026123, "value_mse_loss_layer_016": 0.022461, "value_mse_loss_layer_017": 0.026611, "value_mse_loss_layer_018": 0.024048, "value_mse_loss_layer_019": 0.02832, "value_mse_loss_layer_020": 0.0354, "value_mse_loss_layer_021": 0.039062, "value_mse_loss_layer_022": 0.03418, "value_mse_loss_layer_023": 0.045654, "value_mse_loss_layer_024": 0.041748, "value_mse_loss_layer_025": 0.051514, "value_mse_loss_layer_026": 0.046875, "value_mse_loss_layer_027": 0.055908, "value_mse_loss_layer_028": 0.05957, "value_mse_loss_layer_029": 0.08252, "value_mse_loss_layer_030": 0.088867, "value_mse_loss_layer_031": 0.092285, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2.2e-05, "vq_loss_layer_002": 2.8e-05, "vq_loss_layer_003": 5.2e-05, "vq_loss_layer_004": 0.000108, "vq_loss_layer_005": 0.000121, "vq_loss_layer_006": 0.000193, "vq_loss_layer_007": 0.000261, "vq_loss_layer_008": 0.000269, "vq_loss_layer_009": 0.00034, "vq_loss_layer_010": 0.000368, "vq_loss_layer_011": 0.000376, "vq_loss_layer_012": 0.000553, "vq_loss_layer_013": 0.000425, "vq_loss_layer_014": 0.000538, "vq_loss_layer_015": 0.000622, "vq_loss_layer_016": 0.000523, "vq_loss_layer_017": 0.000458, "vq_loss_layer_018": 0.000286, "vq_loss_layer_019": 0.000265, "vq_loss_layer_020": 0.000452, "vq_loss_layer_021": 0.000767, "vq_loss_layer_022": 0.00041, "vq_loss_layer_023": 0.000629, "vq_loss_layer_024": 0.000366, "vq_loss_layer_025": 0.000641, "vq_loss_layer_026": 0.000786, "vq_loss_layer_027": 0.00074, "vq_loss_layer_028": 0.001251, "vq_loss_layer_029": 0.001434, "vq_loss_layer_030": 0.003113, "vq_loss_layer_031": 0.007324 }, { "ce_loss": 2.325709, "epoch": 0.00464, "grad_norm": 0.0044095576740801334, "key_mse_loss_layer_000": 0.003967, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.096191, "key_mse_loss_layer_031": 0.083008, "kv_mse_loss": 0.055228, "kv_vq_loss": 0.000601, "learning_rate": 0.00091662949513872, "loss": 0.055814, "step": 4640, "value_mse_loss_layer_000": 0.000706, "value_mse_loss_layer_001": 0.002075, "value_mse_loss_layer_002": 0.008057, "value_mse_loss_layer_003": 0.012451, "value_mse_loss_layer_004": 0.01178, "value_mse_loss_layer_005": 0.011414, "value_mse_loss_layer_006": 0.013733, "value_mse_loss_layer_007": 0.015015, "value_mse_loss_layer_008": 0.018433, "value_mse_loss_layer_009": 0.023193, "value_mse_loss_layer_010": 0.020142, "value_mse_loss_layer_011": 0.020996, "value_mse_loss_layer_012": 0.022095, "value_mse_loss_layer_013": 0.022461, "value_mse_loss_layer_014": 0.025146, "value_mse_loss_layer_015": 0.026489, "value_mse_loss_layer_016": 0.025513, "value_mse_loss_layer_017": 0.026245, "value_mse_loss_layer_018": 0.025635, "value_mse_loss_layer_019": 0.028076, "value_mse_loss_layer_020": 0.030151, "value_mse_loss_layer_021": 0.034424, "value_mse_loss_layer_022": 0.0354, "value_mse_loss_layer_023": 0.039307, "value_mse_loss_layer_024": 0.042236, "value_mse_loss_layer_025": 0.050781, "value_mse_loss_layer_026": 0.043457, "value_mse_loss_layer_027": 0.067871, "value_mse_loss_layer_028": 0.05957, "value_mse_loss_layer_029": 0.092773, "value_mse_loss_layer_030": 0.07959, "value_mse_loss_layer_031": 0.086914, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000151, "vq_loss_layer_007": 0.000225, "vq_loss_layer_008": 0.000237, "vq_loss_layer_009": 0.000286, "vq_loss_layer_010": 0.000257, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.000471, "vq_loss_layer_013": 0.000345, "vq_loss_layer_014": 0.000523, "vq_loss_layer_015": 0.000456, "vq_loss_layer_016": 0.000549, "vq_loss_layer_017": 0.000374, "vq_loss_layer_018": 0.000282, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.000402, "vq_loss_layer_022": 0.000317, "vq_loss_layer_023": 0.000343, "vq_loss_layer_024": 0.000326, "vq_loss_layer_025": 0.000378, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.000938, "vq_loss_layer_028": 0.00087, "vq_loss_layer_029": 0.001411, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.005615 }, { "ce_loss": 2.295589, "epoch": 0.00465, "grad_norm": 0.0028840687591582537, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.083496, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.092773, "key_mse_loss_layer_012": 0.068359, "key_mse_loss_layer_013": 0.105957, "key_mse_loss_layer_014": 0.103027, "key_mse_loss_layer_015": 0.092773, "key_mse_loss_layer_016": 0.084473, "key_mse_loss_layer_017": 0.087402, "key_mse_loss_layer_018": 0.093262, "key_mse_loss_layer_019": 0.080566, "key_mse_loss_layer_020": 0.087891, "key_mse_loss_layer_021": 0.083496, "key_mse_loss_layer_022": 0.085449, "key_mse_loss_layer_023": 0.083496, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.055057, "kv_vq_loss": 0.000588, "learning_rate": 0.0009168632382224884, "loss": 0.055621, "step": 4650, "value_mse_loss_layer_000": 0.000706, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.007996, "value_mse_loss_layer_003": 0.013123, "value_mse_loss_layer_004": 0.012268, "value_mse_loss_layer_005": 0.011536, "value_mse_loss_layer_006": 0.013428, "value_mse_loss_layer_007": 0.015137, "value_mse_loss_layer_008": 0.017578, "value_mse_loss_layer_009": 0.022461, "value_mse_loss_layer_010": 0.018799, "value_mse_loss_layer_011": 0.019897, "value_mse_loss_layer_012": 0.02063, "value_mse_loss_layer_013": 0.022217, "value_mse_loss_layer_014": 0.022827, "value_mse_loss_layer_015": 0.026489, "value_mse_loss_layer_016": 0.021973, "value_mse_loss_layer_017": 0.025024, "value_mse_loss_layer_018": 0.022827, "value_mse_loss_layer_019": 0.031006, "value_mse_loss_layer_020": 0.027588, "value_mse_loss_layer_021": 0.033203, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.037354, "value_mse_loss_layer_024": 0.04126, "value_mse_loss_layer_025": 0.048096, "value_mse_loss_layer_026": 0.047363, "value_mse_loss_layer_027": 0.058594, "value_mse_loss_layer_028": 0.056152, "value_mse_loss_layer_029": 0.083008, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.084961, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 9.1e-05, "vq_loss_layer_006": 0.000148, "vq_loss_layer_007": 0.000233, "vq_loss_layer_008": 0.000227, "vq_loss_layer_009": 0.000275, "vq_loss_layer_010": 0.00025, "vq_loss_layer_011": 0.000269, "vq_loss_layer_012": 0.00042, "vq_loss_layer_013": 0.000376, "vq_loss_layer_014": 0.000433, "vq_loss_layer_015": 0.000595, "vq_loss_layer_016": 0.000504, "vq_loss_layer_017": 0.00036, "vq_loss_layer_018": 0.000225, "vq_loss_layer_019": 0.000256, "vq_loss_layer_020": 0.000227, "vq_loss_layer_021": 0.00045, "vq_loss_layer_022": 0.00028, "vq_loss_layer_023": 0.000338, "vq_loss_layer_024": 0.000332, "vq_loss_layer_025": 0.000362, "vq_loss_layer_026": 0.000801, "vq_loss_layer_027": 0.000732, "vq_loss_layer_028": 0.0009, "vq_loss_layer_029": 0.001358, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.005829 }, { "ce_loss": 2.304883, "epoch": 0.00466, "grad_norm": 0.005330196116119623, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.042236, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.090332, "key_mse_loss_layer_009": 0.096191, "key_mse_loss_layer_010": 0.10791, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.07959, "key_mse_loss_layer_013": 0.142578, "key_mse_loss_layer_014": 0.138672, "key_mse_loss_layer_015": 0.122559, "key_mse_loss_layer_016": 0.120605, "key_mse_loss_layer_017": 0.119141, "key_mse_loss_layer_018": 0.12793, "key_mse_loss_layer_019": 0.097168, "key_mse_loss_layer_020": 0.111816, "key_mse_loss_layer_021": 0.107422, "key_mse_loss_layer_022": 0.112793, "key_mse_loss_layer_023": 0.111816, "key_mse_loss_layer_024": 0.089355, "key_mse_loss_layer_025": 0.081543, "key_mse_loss_layer_026": 0.099121, "key_mse_loss_layer_027": 0.09375, "key_mse_loss_layer_028": 0.102051, "key_mse_loss_layer_029": 0.089355, "key_mse_loss_layer_030": 0.101074, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.055081, "kv_vq_loss": 0.000604, "learning_rate": 0.0009170964791724999, "loss": 0.055673, "step": 4660, "value_mse_loss_layer_000": 0.000683, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.009216, "value_mse_loss_layer_003": 0.013123, "value_mse_loss_layer_004": 0.012207, "value_mse_loss_layer_005": 0.011658, "value_mse_loss_layer_006": 0.013672, "value_mse_loss_layer_007": 0.015259, "value_mse_loss_layer_008": 0.018188, "value_mse_loss_layer_009": 0.022949, "value_mse_loss_layer_010": 0.019775, "value_mse_loss_layer_011": 0.020874, "value_mse_loss_layer_012": 0.021606, "value_mse_loss_layer_013": 0.02356, "value_mse_loss_layer_014": 0.023804, "value_mse_loss_layer_015": 0.023804, "value_mse_loss_layer_016": 0.020386, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.023438, "value_mse_loss_layer_019": 0.027466, "value_mse_loss_layer_020": 0.028931, "value_mse_loss_layer_021": 0.032959, "value_mse_loss_layer_022": 0.029419, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.066895, "value_mse_loss_layer_026": 0.042725, "value_mse_loss_layer_027": 0.05127, "value_mse_loss_layer_028": 0.055908, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.078125, "value_mse_loss_layer_031": 0.094238, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.8e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 9.2e-05, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000163, "vq_loss_layer_007": 0.000221, "vq_loss_layer_008": 0.000328, "vq_loss_layer_009": 0.00032, "vq_loss_layer_010": 0.000324, "vq_loss_layer_011": 0.000355, "vq_loss_layer_012": 0.000534, "vq_loss_layer_013": 0.000437, "vq_loss_layer_014": 0.000561, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.00045, "vq_loss_layer_017": 0.000368, "vq_loss_layer_018": 0.000273, "vq_loss_layer_019": 0.000226, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000523, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000433, "vq_loss_layer_024": 0.000473, "vq_loss_layer_025": 0.001022, "vq_loss_layer_026": 0.000847, "vq_loss_layer_027": 0.000793, "vq_loss_layer_028": 0.001427, "vq_loss_layer_029": 0.001457, "vq_loss_layer_030": 0.004822, "vq_loss_layer_031": 0.007935 }, { "ce_loss": 2.281917, "epoch": 0.00467, "grad_norm": 0.003685602219775319, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.055179, "kv_vq_loss": 0.000611, "learning_rate": 0.0009173292201415279, "loss": 0.055759, "step": 4670, "value_mse_loss_layer_000": 0.000694, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.008362, "value_mse_loss_layer_003": 0.013, "value_mse_loss_layer_004": 0.012695, "value_mse_loss_layer_005": 0.012939, "value_mse_loss_layer_006": 0.013977, "value_mse_loss_layer_007": 0.015198, "value_mse_loss_layer_008": 0.018799, "value_mse_loss_layer_009": 0.02417, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.021362, "value_mse_loss_layer_012": 0.022217, "value_mse_loss_layer_013": 0.02356, "value_mse_loss_layer_014": 0.02478, "value_mse_loss_layer_015": 0.028687, "value_mse_loss_layer_016": 0.022827, "value_mse_loss_layer_017": 0.026855, "value_mse_loss_layer_018": 0.02478, "value_mse_loss_layer_019": 0.028076, "value_mse_loss_layer_020": 0.03064, "value_mse_loss_layer_021": 0.03833, "value_mse_loss_layer_022": 0.034668, "value_mse_loss_layer_023": 0.039062, "value_mse_loss_layer_024": 0.042725, "value_mse_loss_layer_025": 0.057373, "value_mse_loss_layer_026": 0.04834, "value_mse_loss_layer_027": 0.055908, "value_mse_loss_layer_028": 0.063965, "value_mse_loss_layer_029": 0.084961, "value_mse_loss_layer_030": 0.080078, "value_mse_loss_layer_031": 0.086914, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 9.5e-05, "vq_loss_layer_005": 0.000119, "vq_loss_layer_006": 0.000144, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000259, "vq_loss_layer_009": 0.000315, "vq_loss_layer_010": 0.000257, "vq_loss_layer_011": 0.000298, "vq_loss_layer_012": 0.000471, "vq_loss_layer_013": 0.000383, "vq_loss_layer_014": 0.000492, "vq_loss_layer_015": 0.000549, "vq_loss_layer_016": 0.000469, "vq_loss_layer_017": 0.000414, "vq_loss_layer_018": 0.000265, "vq_loss_layer_019": 0.000207, "vq_loss_layer_020": 0.000246, "vq_loss_layer_021": 0.000603, "vq_loss_layer_022": 0.000334, "vq_loss_layer_023": 0.000372, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.000444, "vq_loss_layer_026": 0.000721, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.00103, "vq_loss_layer_029": 0.001366, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.005676 }, { "ce_loss": 2.357136, "epoch": 0.00468, "grad_norm": 0.003249072702601552, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.054779, "kv_vq_loss": 0.000571, "learning_rate": 0.0009175614632685309, "loss": 0.055313, "step": 4680, "value_mse_loss_layer_000": 0.000702, "value_mse_loss_layer_001": 0.002075, "value_mse_loss_layer_002": 0.007996, "value_mse_loss_layer_003": 0.012634, "value_mse_loss_layer_004": 0.01178, "value_mse_loss_layer_005": 0.011414, "value_mse_loss_layer_006": 0.013672, "value_mse_loss_layer_007": 0.014526, "value_mse_loss_layer_008": 0.0177, "value_mse_loss_layer_009": 0.023071, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.020508, "value_mse_loss_layer_012": 0.022095, "value_mse_loss_layer_013": 0.022217, "value_mse_loss_layer_014": 0.024658, "value_mse_loss_layer_015": 0.026733, "value_mse_loss_layer_016": 0.024048, "value_mse_loss_layer_017": 0.025879, "value_mse_loss_layer_018": 0.024902, "value_mse_loss_layer_019": 0.02771, "value_mse_loss_layer_020": 0.029175, "value_mse_loss_layer_021": 0.034912, "value_mse_loss_layer_022": 0.033447, "value_mse_loss_layer_023": 0.038818, "value_mse_loss_layer_024": 0.042725, "value_mse_loss_layer_025": 0.050537, "value_mse_loss_layer_026": 0.045898, "value_mse_loss_layer_027": 0.056641, "value_mse_loss_layer_028": 0.059082, "value_mse_loss_layer_029": 0.081543, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.089844, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000136, "vq_loss_layer_007": 0.000197, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000265, "vq_loss_layer_010": 0.000229, "vq_loss_layer_011": 0.000261, "vq_loss_layer_012": 0.000483, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000456, "vq_loss_layer_015": 0.000454, "vq_loss_layer_016": 0.000515, "vq_loss_layer_017": 0.000362, "vq_loss_layer_018": 0.000235, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000402, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.000816, "vq_loss_layer_029": 0.001266, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.005402 }, { "ce_loss": 2.307703, "epoch": 0.00469, "grad_norm": 0.004144272301346064, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.055176, "kv_vq_loss": 0.000583, "learning_rate": 0.0009177932106787707, "loss": 0.055734, "step": 4690, "value_mse_loss_layer_000": 0.00069, "value_mse_loss_layer_001": 0.00209, "value_mse_loss_layer_002": 0.00824, "value_mse_loss_layer_003": 0.014404, "value_mse_loss_layer_004": 0.011902, "value_mse_loss_layer_005": 0.011475, "value_mse_loss_layer_006": 0.013672, "value_mse_loss_layer_007": 0.015259, "value_mse_loss_layer_008": 0.018677, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.020508, "value_mse_loss_layer_011": 0.020386, "value_mse_loss_layer_012": 0.020752, "value_mse_loss_layer_013": 0.022705, "value_mse_loss_layer_014": 0.025024, "value_mse_loss_layer_015": 0.026489, "value_mse_loss_layer_016": 0.022339, "value_mse_loss_layer_017": 0.026611, "value_mse_loss_layer_018": 0.025146, "value_mse_loss_layer_019": 0.030029, "value_mse_loss_layer_020": 0.030762, "value_mse_loss_layer_021": 0.0354, "value_mse_loss_layer_022": 0.035645, "value_mse_loss_layer_023": 0.037354, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.053223, "value_mse_loss_layer_026": 0.043213, "value_mse_loss_layer_027": 0.058594, "value_mse_loss_layer_028": 0.063477, "value_mse_loss_layer_029": 0.095703, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.084961, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000142, "vq_loss_layer_007": 0.000229, "vq_loss_layer_008": 0.00024, "vq_loss_layer_009": 0.000259, "vq_loss_layer_010": 0.000242, "vq_loss_layer_011": 0.000256, "vq_loss_layer_012": 0.000402, "vq_loss_layer_013": 0.000345, "vq_loss_layer_014": 0.000467, "vq_loss_layer_015": 0.000431, "vq_loss_layer_016": 0.000452, "vq_loss_layer_017": 0.000383, "vq_loss_layer_018": 0.000246, "vq_loss_layer_019": 0.000187, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000423, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000338, "vq_loss_layer_026": 0.000469, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.000885, "vq_loss_layer_029": 0.001328, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.004822 }, { "ce_loss": 2.282579, "epoch": 0.0047, "grad_norm": 0.005303672049194574, "key_mse_loss_layer_000": 0.00412, "key_mse_loss_layer_001": 0.01123, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.05549, "kv_vq_loss": 0.000602, "learning_rate": 0.0009180244644839292, "loss": 0.056073, "step": 4700, "value_mse_loss_layer_000": 0.000729, "value_mse_loss_layer_001": 0.002167, "value_mse_loss_layer_002": 0.008301, "value_mse_loss_layer_003": 0.013367, "value_mse_loss_layer_004": 0.01239, "value_mse_loss_layer_005": 0.011658, "value_mse_loss_layer_006": 0.013733, "value_mse_loss_layer_007": 0.014954, "value_mse_loss_layer_008": 0.018677, "value_mse_loss_layer_009": 0.02356, "value_mse_loss_layer_010": 0.019531, "value_mse_loss_layer_011": 0.02063, "value_mse_loss_layer_012": 0.024902, "value_mse_loss_layer_013": 0.022461, "value_mse_loss_layer_014": 0.023804, "value_mse_loss_layer_015": 0.026978, "value_mse_loss_layer_016": 0.022461, "value_mse_loss_layer_017": 0.026733, "value_mse_loss_layer_018": 0.024414, "value_mse_loss_layer_019": 0.028442, "value_mse_loss_layer_020": 0.029175, "value_mse_loss_layer_021": 0.042969, "value_mse_loss_layer_022": 0.035889, "value_mse_loss_layer_023": 0.042236, "value_mse_loss_layer_024": 0.041504, "value_mse_loss_layer_025": 0.085938, "value_mse_loss_layer_026": 0.046387, "value_mse_loss_layer_027": 0.061279, "value_mse_loss_layer_028": 0.061279, "value_mse_loss_layer_029": 0.086914, "value_mse_loss_layer_030": 0.085938, "value_mse_loss_layer_031": 0.086914, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 9.3e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000157, "vq_loss_layer_007": 0.000238, "vq_loss_layer_008": 0.000265, "vq_loss_layer_009": 0.00032, "vq_loss_layer_010": 0.000254, "vq_loss_layer_011": 0.000303, "vq_loss_layer_012": 0.000679, "vq_loss_layer_013": 0.000397, "vq_loss_layer_014": 0.000452, "vq_loss_layer_015": 0.000553, "vq_loss_layer_016": 0.000467, "vq_loss_layer_017": 0.00046, "vq_loss_layer_018": 0.000244, "vq_loss_layer_019": 0.00022, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000641, "vq_loss_layer_022": 0.000298, "vq_loss_layer_023": 0.000362, "vq_loss_layer_024": 0.000338, "vq_loss_layer_025": 0.000751, "vq_loss_layer_026": 0.000687, "vq_loss_layer_027": 0.000801, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.001457, "vq_loss_layer_030": 0.003311, "vq_loss_layer_031": 0.005249 }, { "ce_loss": 2.318319, "epoch": 0.00471, "grad_norm": 0.0033434280194342136, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.055276, "kv_vq_loss": 0.000589, "learning_rate": 0.000918255226782224, "loss": 0.055835, "step": 4710, "value_mse_loss_layer_000": 0.000694, "value_mse_loss_layer_001": 0.00209, "value_mse_loss_layer_002": 0.007996, "value_mse_loss_layer_003": 0.012817, "value_mse_loss_layer_004": 0.012695, "value_mse_loss_layer_005": 0.011902, "value_mse_loss_layer_006": 0.014221, "value_mse_loss_layer_007": 0.015137, "value_mse_loss_layer_008": 0.018188, "value_mse_loss_layer_009": 0.023682, "value_mse_loss_layer_010": 0.019287, "value_mse_loss_layer_011": 0.020752, "value_mse_loss_layer_012": 0.021362, "value_mse_loss_layer_013": 0.022827, "value_mse_loss_layer_014": 0.023804, "value_mse_loss_layer_015": 0.026611, "value_mse_loss_layer_016": 0.022583, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.025146, "value_mse_loss_layer_019": 0.027832, "value_mse_loss_layer_020": 0.032959, "value_mse_loss_layer_021": 0.036377, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.038818, "value_mse_loss_layer_024": 0.043701, "value_mse_loss_layer_025": 0.054199, "value_mse_loss_layer_026": 0.046875, "value_mse_loss_layer_027": 0.059814, "value_mse_loss_layer_028": 0.060547, "value_mse_loss_layer_029": 0.085938, "value_mse_loss_layer_030": 0.081055, "value_mse_loss_layer_031": 0.088867, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 0.000101, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000172, "vq_loss_layer_007": 0.000223, "vq_loss_layer_008": 0.000227, "vq_loss_layer_009": 0.000313, "vq_loss_layer_010": 0.000231, "vq_loss_layer_011": 0.000273, "vq_loss_layer_012": 0.000437, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.000427, "vq_loss_layer_015": 0.000479, "vq_loss_layer_016": 0.000496, "vq_loss_layer_017": 0.000401, "vq_loss_layer_018": 0.00029, "vq_loss_layer_019": 0.000214, "vq_loss_layer_020": 0.000309, "vq_loss_layer_021": 0.000477, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000334, "vq_loss_layer_024": 0.000433, "vq_loss_layer_025": 0.000414, "vq_loss_layer_026": 0.000645, "vq_loss_layer_027": 0.000645, "vq_loss_layer_028": 0.000889, "vq_loss_layer_029": 0.001343, "vq_loss_layer_030": 0.002502, "vq_loss_layer_031": 0.005432 }, { "ce_loss": 2.259989, "epoch": 0.00472, "grad_norm": 0.004067239351570606, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.055023, "kv_vq_loss": 0.000602, "learning_rate": 0.0009184854996585219, "loss": 0.055609, "step": 4720, "value_mse_loss_layer_000": 0.000675, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.008301, "value_mse_loss_layer_003": 0.015625, "value_mse_loss_layer_004": 0.012024, "value_mse_loss_layer_005": 0.011963, "value_mse_loss_layer_006": 0.013672, "value_mse_loss_layer_007": 0.015015, "value_mse_loss_layer_008": 0.018921, "value_mse_loss_layer_009": 0.023438, "value_mse_loss_layer_010": 0.019775, "value_mse_loss_layer_011": 0.02124, "value_mse_loss_layer_012": 0.020996, "value_mse_loss_layer_013": 0.023193, "value_mse_loss_layer_014": 0.025635, "value_mse_loss_layer_015": 0.027222, "value_mse_loss_layer_016": 0.02417, "value_mse_loss_layer_017": 0.026855, "value_mse_loss_layer_018": 0.02356, "value_mse_loss_layer_019": 0.027954, "value_mse_loss_layer_020": 0.031738, "value_mse_loss_layer_021": 0.036377, "value_mse_loss_layer_022": 0.037598, "value_mse_loss_layer_023": 0.036865, "value_mse_loss_layer_024": 0.04126, "value_mse_loss_layer_025": 0.05249, "value_mse_loss_layer_026": 0.042725, "value_mse_loss_layer_027": 0.054199, "value_mse_loss_layer_028": 0.057617, "value_mse_loss_layer_029": 0.100586, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.084961, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 0.0001, "vq_loss_layer_006": 0.000147, "vq_loss_layer_007": 0.000229, "vq_loss_layer_008": 0.000269, "vq_loss_layer_009": 0.000301, "vq_loss_layer_010": 0.00025, "vq_loss_layer_011": 0.000298, "vq_loss_layer_012": 0.00045, "vq_loss_layer_013": 0.000376, "vq_loss_layer_014": 0.000526, "vq_loss_layer_015": 0.000587, "vq_loss_layer_016": 0.000542, "vq_loss_layer_017": 0.000454, "vq_loss_layer_018": 0.000252, "vq_loss_layer_019": 0.000209, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.000534, "vq_loss_layer_022": 0.000383, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.000391, "vq_loss_layer_026": 0.000572, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001389, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.005188 }, { "ce_loss": 2.271888, "epoch": 0.00473, "grad_norm": 0.0038253720849752426, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.055249, "kv_vq_loss": 0.000592, "learning_rate": 0.0009187152851844528, "loss": 0.055817, "step": 4730, "value_mse_loss_layer_000": 0.00069, "value_mse_loss_layer_001": 0.002045, "value_mse_loss_layer_002": 0.007996, "value_mse_loss_layer_003": 0.013184, "value_mse_loss_layer_004": 0.011108, "value_mse_loss_layer_005": 0.01123, "value_mse_loss_layer_006": 0.01532, "value_mse_loss_layer_007": 0.014832, "value_mse_loss_layer_008": 0.018066, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.019409, "value_mse_loss_layer_011": 0.021851, "value_mse_loss_layer_012": 0.023438, "value_mse_loss_layer_013": 0.022705, "value_mse_loss_layer_014": 0.023438, "value_mse_loss_layer_015": 0.026367, "value_mse_loss_layer_016": 0.020874, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.024292, "value_mse_loss_layer_019": 0.026733, "value_mse_loss_layer_020": 0.028442, "value_mse_loss_layer_021": 0.033447, "value_mse_loss_layer_022": 0.034668, "value_mse_loss_layer_023": 0.04126, "value_mse_loss_layer_024": 0.038818, "value_mse_loss_layer_025": 0.048096, "value_mse_loss_layer_026": 0.041504, "value_mse_loss_layer_027": 0.050293, "value_mse_loss_layer_028": 0.061523, "value_mse_loss_layer_029": 0.07959, "value_mse_loss_layer_030": 0.078613, "value_mse_loss_layer_031": 0.082031, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000221, "vq_loss_layer_007": 0.00022, "vq_loss_layer_008": 0.000242, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.000241, "vq_loss_layer_011": 0.000383, "vq_loss_layer_012": 0.000687, "vq_loss_layer_013": 0.00037, "vq_loss_layer_014": 0.000496, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.000406, "vq_loss_layer_017": 0.00042, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000225, "vq_loss_layer_021": 0.000433, "vq_loss_layer_022": 0.00034, "vq_loss_layer_023": 0.000458, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000385, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.00106, "vq_loss_layer_030": 0.003448, "vq_loss_layer_031": 0.004669 }, { "ce_loss": 2.300779, "epoch": 0.00474, "grad_norm": 0.002940315520390868, "key_mse_loss_layer_000": 0.003647, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.055643, "kv_vq_loss": 0.000629, "learning_rate": 0.0009189445854185212, "loss": 0.05625, "step": 4740, "value_mse_loss_layer_000": 0.00071, "value_mse_loss_layer_001": 0.00209, "value_mse_loss_layer_002": 0.008301, "value_mse_loss_layer_003": 0.012695, "value_mse_loss_layer_004": 0.012695, "value_mse_loss_layer_005": 0.011658, "value_mse_loss_layer_006": 0.014221, "value_mse_loss_layer_007": 0.015259, "value_mse_loss_layer_008": 0.018311, "value_mse_loss_layer_009": 0.025635, "value_mse_loss_layer_010": 0.019897, "value_mse_loss_layer_011": 0.020996, "value_mse_loss_layer_012": 0.021484, "value_mse_loss_layer_013": 0.023438, "value_mse_loss_layer_014": 0.026978, "value_mse_loss_layer_015": 0.027466, "value_mse_loss_layer_016": 0.022095, "value_mse_loss_layer_017": 0.027832, "value_mse_loss_layer_018": 0.024048, "value_mse_loss_layer_019": 0.031738, "value_mse_loss_layer_020": 0.029663, "value_mse_loss_layer_021": 0.0354, "value_mse_loss_layer_022": 0.03418, "value_mse_loss_layer_023": 0.036377, "value_mse_loss_layer_024": 0.038574, "value_mse_loss_layer_025": 0.053711, "value_mse_loss_layer_026": 0.04248, "value_mse_loss_layer_027": 0.055176, "value_mse_loss_layer_028": 0.059326, "value_mse_loss_layer_029": 0.07959, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.085938, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 0.000113, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000173, "vq_loss_layer_007": 0.000233, "vq_loss_layer_008": 0.000237, "vq_loss_layer_009": 0.000431, "vq_loss_layer_010": 0.000238, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.000479, "vq_loss_layer_013": 0.000374, "vq_loss_layer_014": 0.000622, "vq_loss_layer_015": 0.000557, "vq_loss_layer_016": 0.000425, "vq_loss_layer_017": 0.00053, "vq_loss_layer_018": 0.000254, "vq_loss_layer_019": 0.000232, "vq_loss_layer_020": 0.000257, "vq_loss_layer_021": 0.000492, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000303, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.000454, "vq_loss_layer_026": 0.000652, "vq_loss_layer_027": 0.000675, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001205, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.005432 }, { "ce_loss": 2.236036, "epoch": 0.00475, "grad_norm": 0.003115186234936118, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.055426, "kv_vq_loss": 0.000593, "learning_rate": 0.0009191734024062165, "loss": 0.055994, "step": 4750, "value_mse_loss_layer_000": 0.00069, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.007812, "value_mse_loss_layer_003": 0.012939, "value_mse_loss_layer_004": 0.011719, "value_mse_loss_layer_005": 0.011536, "value_mse_loss_layer_006": 0.013672, "value_mse_loss_layer_007": 0.014771, "value_mse_loss_layer_008": 0.018188, "value_mse_loss_layer_009": 0.023438, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.021118, "value_mse_loss_layer_012": 0.021851, "value_mse_loss_layer_013": 0.023926, "value_mse_loss_layer_014": 0.024658, "value_mse_loss_layer_015": 0.027344, "value_mse_loss_layer_016": 0.028076, "value_mse_loss_layer_017": 0.026367, "value_mse_loss_layer_018": 0.024292, "value_mse_loss_layer_019": 0.02771, "value_mse_loss_layer_020": 0.032471, "value_mse_loss_layer_021": 0.034424, "value_mse_loss_layer_022": 0.033691, "value_mse_loss_layer_023": 0.036621, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.047852, "value_mse_loss_layer_026": 0.044189, "value_mse_loss_layer_027": 0.053223, "value_mse_loss_layer_028": 0.057861, "value_mse_loss_layer_029": 0.078613, "value_mse_loss_layer_030": 0.075195, "value_mse_loss_layer_031": 0.084961, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 9.4e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000208, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000267, "vq_loss_layer_010": 0.00024, "vq_loss_layer_011": 0.000256, "vq_loss_layer_012": 0.000444, "vq_loss_layer_013": 0.000391, "vq_loss_layer_014": 0.0005, "vq_loss_layer_015": 0.000515, "vq_loss_layer_016": 0.000626, "vq_loss_layer_017": 0.000401, "vq_loss_layer_018": 0.000236, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.00041, "vq_loss_layer_022": 0.00033, "vq_loss_layer_023": 0.00032, "vq_loss_layer_024": 0.000376, "vq_loss_layer_025": 0.000401, "vq_loss_layer_026": 0.000675, "vq_loss_layer_027": 0.000622, "vq_loss_layer_028": 0.000874, "vq_loss_layer_029": 0.001305, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.005463 }, { "ce_loss": 2.21716, "epoch": 0.00476, "grad_norm": 0.0032936965581029654, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.05589, "kv_vq_loss": 0.000626, "learning_rate": 0.0009194017381801232, "loss": 0.056488, "step": 4760, "value_mse_loss_layer_000": 0.000687, "value_mse_loss_layer_001": 0.002121, "value_mse_loss_layer_002": 0.00824, "value_mse_loss_layer_003": 0.013306, "value_mse_loss_layer_004": 0.01178, "value_mse_loss_layer_005": 0.01178, "value_mse_loss_layer_006": 0.013611, "value_mse_loss_layer_007": 0.014954, "value_mse_loss_layer_008": 0.017944, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.019287, "value_mse_loss_layer_011": 0.02124, "value_mse_loss_layer_012": 0.021851, "value_mse_loss_layer_013": 0.023682, "value_mse_loss_layer_014": 0.025024, "value_mse_loss_layer_015": 0.026611, "value_mse_loss_layer_016": 0.023438, "value_mse_loss_layer_017": 0.025635, "value_mse_loss_layer_018": 0.026489, "value_mse_loss_layer_019": 0.0271, "value_mse_loss_layer_020": 0.030273, "value_mse_loss_layer_021": 0.0354, "value_mse_loss_layer_022": 0.032471, "value_mse_loss_layer_023": 0.0354, "value_mse_loss_layer_024": 0.041504, "value_mse_loss_layer_025": 0.052979, "value_mse_loss_layer_026": 0.042236, "value_mse_loss_layer_027": 0.053467, "value_mse_loss_layer_028": 0.055908, "value_mse_loss_layer_029": 0.07959, "value_mse_loss_layer_030": 0.076172, "value_mse_loss_layer_031": 0.086914, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000144, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000256, "vq_loss_layer_011": 0.000309, "vq_loss_layer_012": 0.000456, "vq_loss_layer_013": 0.000452, "vq_loss_layer_014": 0.000526, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.000565, "vq_loss_layer_017": 0.000431, "vq_loss_layer_018": 0.000374, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000301, "vq_loss_layer_021": 0.000526, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.000402, "vq_loss_layer_025": 0.000496, "vq_loss_layer_026": 0.000671, "vq_loss_layer_027": 0.000637, "vq_loss_layer_028": 0.000908, "vq_loss_layer_029": 0.001305, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.00592 }, { "ce_loss": 2.301195, "epoch": 0.00477, "grad_norm": 0.003830628003925085, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.056, "kv_vq_loss": 0.000618, "learning_rate": 0.0009196295947600284, "loss": 0.056592, "step": 4770, "value_mse_loss_layer_000": 0.000702, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.007935, "value_mse_loss_layer_003": 0.012024, "value_mse_loss_layer_004": 0.011353, "value_mse_loss_layer_005": 0.010742, "value_mse_loss_layer_006": 0.013672, "value_mse_loss_layer_007": 0.014465, "value_mse_loss_layer_008": 0.017456, "value_mse_loss_layer_009": 0.022705, "value_mse_loss_layer_010": 0.020996, "value_mse_loss_layer_011": 0.02063, "value_mse_loss_layer_012": 0.020996, "value_mse_loss_layer_013": 0.022583, "value_mse_loss_layer_014": 0.025146, "value_mse_loss_layer_015": 0.026855, "value_mse_loss_layer_016": 0.022217, "value_mse_loss_layer_017": 0.027222, "value_mse_loss_layer_018": 0.024414, "value_mse_loss_layer_019": 0.029419, "value_mse_loss_layer_020": 0.029175, "value_mse_loss_layer_021": 0.035889, "value_mse_loss_layer_022": 0.035645, "value_mse_loss_layer_023": 0.04126, "value_mse_loss_layer_024": 0.040039, "value_mse_loss_layer_025": 0.049072, "value_mse_loss_layer_026": 0.048584, "value_mse_loss_layer_027": 0.054932, "value_mse_loss_layer_028": 0.063477, "value_mse_loss_layer_029": 0.086914, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.081543, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000156, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000211, "vq_loss_layer_009": 0.000275, "vq_loss_layer_010": 0.000256, "vq_loss_layer_011": 0.000284, "vq_loss_layer_012": 0.000452, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.000519, "vq_loss_layer_015": 0.000473, "vq_loss_layer_016": 0.000429, "vq_loss_layer_017": 0.000507, "vq_loss_layer_018": 0.000261, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.00023, "vq_loss_layer_021": 0.000496, "vq_loss_layer_022": 0.000343, "vq_loss_layer_023": 0.000393, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.00069, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000969, "vq_loss_layer_029": 0.001167, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.004517 }, { "ce_loss": 2.267495, "epoch": 0.00478, "grad_norm": 0.002759776543825865, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.080566, "kv_mse_loss": 0.055435, "kv_vq_loss": 0.000595, "learning_rate": 0.0009198569741530297, "loss": 0.055997, "step": 4780, "value_mse_loss_layer_000": 0.000694, "value_mse_loss_layer_001": 0.002075, "value_mse_loss_layer_002": 0.007812, "value_mse_loss_layer_003": 0.012207, "value_mse_loss_layer_004": 0.011414, "value_mse_loss_layer_005": 0.012756, "value_mse_loss_layer_006": 0.013916, "value_mse_loss_layer_007": 0.014832, "value_mse_loss_layer_008": 0.018433, "value_mse_loss_layer_009": 0.023315, "value_mse_loss_layer_010": 0.020386, "value_mse_loss_layer_011": 0.020874, "value_mse_loss_layer_012": 0.021484, "value_mse_loss_layer_013": 0.023315, "value_mse_loss_layer_014": 0.025024, "value_mse_loss_layer_015": 0.026733, "value_mse_loss_layer_016": 0.022461, "value_mse_loss_layer_017": 0.026245, "value_mse_loss_layer_018": 0.02417, "value_mse_loss_layer_019": 0.027344, "value_mse_loss_layer_020": 0.029541, "value_mse_loss_layer_021": 0.0354, "value_mse_loss_layer_022": 0.034912, "value_mse_loss_layer_023": 0.038574, "value_mse_loss_layer_024": 0.052002, "value_mse_loss_layer_025": 0.05249, "value_mse_loss_layer_026": 0.043945, "value_mse_loss_layer_027": 0.05249, "value_mse_loss_layer_028": 0.058838, "value_mse_loss_layer_029": 0.077148, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.081055, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 0.00013, "vq_loss_layer_006": 0.000158, "vq_loss_layer_007": 0.000218, "vq_loss_layer_008": 0.000222, "vq_loss_layer_009": 0.000288, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.000271, "vq_loss_layer_012": 0.000471, "vq_loss_layer_013": 0.000366, "vq_loss_layer_014": 0.000515, "vq_loss_layer_015": 0.000477, "vq_loss_layer_016": 0.000427, "vq_loss_layer_017": 0.000376, "vq_loss_layer_018": 0.00024, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000237, "vq_loss_layer_021": 0.000443, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.00032, "vq_loss_layer_024": 0.000397, "vq_loss_layer_025": 0.000357, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.001274, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.004608 }, { "ce_loss": 2.272341, "epoch": 0.00479, "grad_norm": 0.0034289690665900707, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.011475, "key_mse_loss_layer_002": 0.06543, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.066895, "key_mse_loss_layer_006": 0.07373, "key_mse_loss_layer_007": 0.081055, "key_mse_loss_layer_008": 0.094727, "key_mse_loss_layer_009": 0.098145, "key_mse_loss_layer_010": 0.114258, "key_mse_loss_layer_011": 0.10791, "key_mse_loss_layer_012": 0.07959, "key_mse_loss_layer_013": 0.142578, "key_mse_loss_layer_014": 0.133789, "key_mse_loss_layer_015": 0.126953, "key_mse_loss_layer_016": 0.126953, "key_mse_loss_layer_017": 0.125, "key_mse_loss_layer_018": 0.137695, "key_mse_loss_layer_019": 0.114746, "key_mse_loss_layer_020": 0.130859, "key_mse_loss_layer_021": 0.12207, "key_mse_loss_layer_022": 0.131836, "key_mse_loss_layer_023": 0.143555, "key_mse_loss_layer_024": 0.114258, "key_mse_loss_layer_025": 0.11084, "key_mse_loss_layer_026": 0.132812, "key_mse_loss_layer_027": 0.126953, "key_mse_loss_layer_028": 0.141602, "key_mse_loss_layer_029": 0.123047, "key_mse_loss_layer_030": 0.132812, "key_mse_loss_layer_031": 0.098633, "kv_mse_loss": 0.055396, "kv_vq_loss": 0.000606, "learning_rate": 0.0009200838783536408, "loss": 0.055975, "step": 4790, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.008179, "value_mse_loss_layer_003": 0.012939, "value_mse_loss_layer_004": 0.013489, "value_mse_loss_layer_005": 0.012756, "value_mse_loss_layer_006": 0.013977, "value_mse_loss_layer_007": 0.015564, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.021973, "value_mse_loss_layer_010": 0.018555, "value_mse_loss_layer_011": 0.019409, "value_mse_loss_layer_012": 0.020264, "value_mse_loss_layer_013": 0.020508, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.022461, "value_mse_loss_layer_016": 0.02124, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.024902, "value_mse_loss_layer_019": 0.029785, "value_mse_loss_layer_020": 0.032227, "value_mse_loss_layer_021": 0.035889, "value_mse_loss_layer_022": 0.033936, "value_mse_loss_layer_023": 0.040527, "value_mse_loss_layer_024": 0.047852, "value_mse_loss_layer_025": 0.05542, "value_mse_loss_layer_026": 0.049316, "value_mse_loss_layer_027": 0.067871, "value_mse_loss_layer_028": 0.068359, "value_mse_loss_layer_029": 0.100586, "value_mse_loss_layer_030": 0.093262, "value_mse_loss_layer_031": 0.106934, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2.3e-05, "vq_loss_layer_002": 2.9e-05, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 0.000109, "vq_loss_layer_005": 0.000117, "vq_loss_layer_006": 0.000161, "vq_loss_layer_007": 0.000231, "vq_loss_layer_008": 0.000239, "vq_loss_layer_009": 0.000299, "vq_loss_layer_010": 0.000259, "vq_loss_layer_011": 0.000267, "vq_loss_layer_012": 0.000523, "vq_loss_layer_013": 0.000362, "vq_loss_layer_014": 0.000431, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.000454, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000288, "vq_loss_layer_019": 0.00022, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.000366, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.00032, "vq_loss_layer_024": 0.000351, "vq_loss_layer_025": 0.000572, "vq_loss_layer_026": 0.000816, "vq_loss_layer_027": 0.0009, "vq_loss_layer_028": 0.0019, "vq_loss_layer_029": 0.003235, "vq_loss_layer_030": 0.005402, "vq_loss_layer_031": 0.008972 }, { "ce_loss": 2.310268, "epoch": 0.0048, "grad_norm": 0.0034384073223918676, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.064453, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.055423, "kv_vq_loss": 0.000601, "learning_rate": 0.0009203103093438967, "loss": 0.056006, "step": 4800, "value_mse_loss_layer_000": 0.00071, "value_mse_loss_layer_001": 0.002121, "value_mse_loss_layer_002": 0.007812, "value_mse_loss_layer_003": 0.012329, "value_mse_loss_layer_004": 0.011414, "value_mse_loss_layer_005": 0.010742, "value_mse_loss_layer_006": 0.013184, "value_mse_loss_layer_007": 0.01416, "value_mse_loss_layer_008": 0.017578, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.019531, "value_mse_loss_layer_011": 0.02063, "value_mse_loss_layer_012": 0.021484, "value_mse_loss_layer_013": 0.022583, "value_mse_loss_layer_014": 0.026123, "value_mse_loss_layer_015": 0.026978, "value_mse_loss_layer_016": 0.022583, "value_mse_loss_layer_017": 0.027588, "value_mse_loss_layer_018": 0.023926, "value_mse_loss_layer_019": 0.02832, "value_mse_loss_layer_020": 0.031006, "value_mse_loss_layer_021": 0.035889, "value_mse_loss_layer_022": 0.034668, "value_mse_loss_layer_023": 0.037354, "value_mse_loss_layer_024": 0.040527, "value_mse_loss_layer_025": 0.050537, "value_mse_loss_layer_026": 0.04541, "value_mse_loss_layer_027": 0.059814, "value_mse_loss_layer_028": 0.064941, "value_mse_loss_layer_029": 0.081543, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.085449, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000153, "vq_loss_layer_007": 0.00023, "vq_loss_layer_008": 0.000197, "vq_loss_layer_009": 0.00028, "vq_loss_layer_010": 0.000223, "vq_loss_layer_011": 0.000273, "vq_loss_layer_012": 0.000507, "vq_loss_layer_013": 0.000359, "vq_loss_layer_014": 0.000483, "vq_loss_layer_015": 0.000492, "vq_loss_layer_016": 0.000458, "vq_loss_layer_017": 0.000483, "vq_loss_layer_018": 0.000225, "vq_loss_layer_019": 0.00021, "vq_loss_layer_020": 0.000236, "vq_loss_layer_021": 0.00042, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000668, "vq_loss_layer_028": 0.000912, "vq_loss_layer_029": 0.001213, "vq_loss_layer_030": 0.002869, "vq_loss_layer_031": 0.004669 }, { "ce_loss": 2.278183, "epoch": 0.00481, "grad_norm": 0.0040422226302325726, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.055524, "kv_vq_loss": 0.000595, "learning_rate": 0.0009205362690934578, "loss": 0.056097, "step": 4810, "value_mse_loss_layer_000": 0.000683, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.009338, "value_mse_loss_layer_003": 0.012512, "value_mse_loss_layer_004": 0.011292, "value_mse_loss_layer_005": 0.011108, "value_mse_loss_layer_006": 0.013489, "value_mse_loss_layer_007": 0.014832, "value_mse_loss_layer_008": 0.017822, "value_mse_loss_layer_009": 0.023315, "value_mse_loss_layer_010": 0.019775, "value_mse_loss_layer_011": 0.02063, "value_mse_loss_layer_012": 0.02124, "value_mse_loss_layer_013": 0.022949, "value_mse_loss_layer_014": 0.023682, "value_mse_loss_layer_015": 0.026245, "value_mse_loss_layer_016": 0.021973, "value_mse_loss_layer_017": 0.026245, "value_mse_loss_layer_018": 0.025024, "value_mse_loss_layer_019": 0.028687, "value_mse_loss_layer_020": 0.030396, "value_mse_loss_layer_021": 0.035889, "value_mse_loss_layer_022": 0.03418, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.040039, "value_mse_loss_layer_025": 0.052246, "value_mse_loss_layer_026": 0.050781, "value_mse_loss_layer_027": 0.057617, "value_mse_loss_layer_028": 0.060791, "value_mse_loss_layer_029": 0.092773, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.083008, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7.3e-05, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.000143, "vq_loss_layer_007": 0.00022, "vq_loss_layer_008": 0.00022, "vq_loss_layer_009": 0.000317, "vq_loss_layer_010": 0.000231, "vq_loss_layer_011": 0.000267, "vq_loss_layer_012": 0.000454, "vq_loss_layer_013": 0.000372, "vq_loss_layer_014": 0.000446, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000427, "vq_loss_layer_017": 0.000387, "vq_loss_layer_018": 0.000265, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000248, "vq_loss_layer_021": 0.000404, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000683, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.001144, "vq_loss_layer_030": 0.002548, "vq_loss_layer_031": 0.004547 }, { "ce_loss": 2.265802, "epoch": 0.00482, "grad_norm": 0.003230678616091609, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.055371, "kv_vq_loss": 0.000594, "learning_rate": 0.0009207617595597122, "loss": 0.05593, "step": 4820, "value_mse_loss_layer_000": 0.00069, "value_mse_loss_layer_001": 0.002045, "value_mse_loss_layer_002": 0.008057, "value_mse_loss_layer_003": 0.012634, "value_mse_loss_layer_004": 0.012146, "value_mse_loss_layer_005": 0.011475, "value_mse_loss_layer_006": 0.01355, "value_mse_loss_layer_007": 0.014587, "value_mse_loss_layer_008": 0.017944, "value_mse_loss_layer_009": 0.022705, "value_mse_loss_layer_010": 0.021606, "value_mse_loss_layer_011": 0.020508, "value_mse_loss_layer_012": 0.022339, "value_mse_loss_layer_013": 0.022461, "value_mse_loss_layer_014": 0.023438, "value_mse_loss_layer_015": 0.026245, "value_mse_loss_layer_016": 0.024048, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.023926, "value_mse_loss_layer_019": 0.027222, "value_mse_loss_layer_020": 0.029419, "value_mse_loss_layer_021": 0.033936, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.037109, "value_mse_loss_layer_024": 0.047119, "value_mse_loss_layer_025": 0.053955, "value_mse_loss_layer_026": 0.043945, "value_mse_loss_layer_027": 0.054932, "value_mse_loss_layer_028": 0.057861, "value_mse_loss_layer_029": 0.077148, "value_mse_loss_layer_030": 0.076172, "value_mse_loss_layer_031": 0.086914, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 0.000102, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000153, "vq_loss_layer_007": 0.000212, "vq_loss_layer_008": 0.000246, "vq_loss_layer_009": 0.000296, "vq_loss_layer_010": 0.00028, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000526, "vq_loss_layer_013": 0.000368, "vq_loss_layer_014": 0.000439, "vq_loss_layer_015": 0.000477, "vq_loss_layer_016": 0.000481, "vq_loss_layer_017": 0.00042, "vq_loss_layer_018": 0.000267, "vq_loss_layer_019": 0.000195, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.000395, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000322, "vq_loss_layer_024": 0.000372, "vq_loss_layer_025": 0.000399, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.000961, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.005127 }, { "ce_loss": 2.249733, "epoch": 0.00483, "grad_norm": 0.004695178009569645, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.089355, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.055264, "kv_vq_loss": 0.00061, "learning_rate": 0.000920986782687878, "loss": 0.055859, "step": 4830, "value_mse_loss_layer_000": 0.000713, "value_mse_loss_layer_001": 0.002136, "value_mse_loss_layer_002": 0.007874, "value_mse_loss_layer_003": 0.012634, "value_mse_loss_layer_004": 0.012024, "value_mse_loss_layer_005": 0.011292, "value_mse_loss_layer_006": 0.01355, "value_mse_loss_layer_007": 0.015564, "value_mse_loss_layer_008": 0.018188, "value_mse_loss_layer_009": 0.023438, "value_mse_loss_layer_010": 0.020752, "value_mse_loss_layer_011": 0.022217, "value_mse_loss_layer_012": 0.021606, "value_mse_loss_layer_013": 0.024414, "value_mse_loss_layer_014": 0.024902, "value_mse_loss_layer_015": 0.026489, "value_mse_loss_layer_016": 0.023071, "value_mse_loss_layer_017": 0.026611, "value_mse_loss_layer_018": 0.02417, "value_mse_loss_layer_019": 0.031494, "value_mse_loss_layer_020": 0.028564, "value_mse_loss_layer_021": 0.033936, "value_mse_loss_layer_022": 0.033936, "value_mse_loss_layer_023": 0.037354, "value_mse_loss_layer_024": 0.041992, "value_mse_loss_layer_025": 0.056885, "value_mse_loss_layer_026": 0.044434, "value_mse_loss_layer_027": 0.05957, "value_mse_loss_layer_028": 0.061279, "value_mse_loss_layer_029": 0.109863, "value_mse_loss_layer_030": 0.078125, "value_mse_loss_layer_031": 0.084473, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000252, "vq_loss_layer_008": 0.000221, "vq_loss_layer_009": 0.000273, "vq_loss_layer_010": 0.000286, "vq_loss_layer_011": 0.00037, "vq_loss_layer_012": 0.000423, "vq_loss_layer_013": 0.000393, "vq_loss_layer_014": 0.00046, "vq_loss_layer_015": 0.000446, "vq_loss_layer_016": 0.000458, "vq_loss_layer_017": 0.000404, "vq_loss_layer_018": 0.000226, "vq_loss_layer_019": 0.000216, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000406, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.00029, "vq_loss_layer_024": 0.000334, "vq_loss_layer_025": 0.000389, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000668, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.00177, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.004913 }, { "ce_loss": 2.330551, "epoch": 0.00484, "grad_norm": 0.003446438116952777, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.055182, "kv_vq_loss": 0.000591, "learning_rate": 0.000921211340411103, "loss": 0.055743, "step": 4840, "value_mse_loss_layer_000": 0.000664, "value_mse_loss_layer_001": 0.002014, "value_mse_loss_layer_002": 0.007812, "value_mse_loss_layer_003": 0.012878, "value_mse_loss_layer_004": 0.011475, "value_mse_loss_layer_005": 0.011414, "value_mse_loss_layer_006": 0.014221, "value_mse_loss_layer_007": 0.015076, "value_mse_loss_layer_008": 0.018188, "value_mse_loss_layer_009": 0.023682, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.021606, "value_mse_loss_layer_012": 0.021362, "value_mse_loss_layer_013": 0.023926, "value_mse_loss_layer_014": 0.02478, "value_mse_loss_layer_015": 0.027832, "value_mse_loss_layer_016": 0.022705, "value_mse_loss_layer_017": 0.02771, "value_mse_loss_layer_018": 0.025146, "value_mse_loss_layer_019": 0.027832, "value_mse_loss_layer_020": 0.031128, "value_mse_loss_layer_021": 0.033691, "value_mse_loss_layer_022": 0.032715, "value_mse_loss_layer_023": 0.039795, "value_mse_loss_layer_024": 0.039307, "value_mse_loss_layer_025": 0.049561, "value_mse_loss_layer_026": 0.041748, "value_mse_loss_layer_027": 0.052734, "value_mse_loss_layer_028": 0.060547, "value_mse_loss_layer_029": 0.084961, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.085449, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 8.8e-05, "vq_loss_layer_006": 0.000174, "vq_loss_layer_007": 0.000228, "vq_loss_layer_008": 0.000219, "vq_loss_layer_009": 0.000271, "vq_loss_layer_010": 0.000238, "vq_loss_layer_011": 0.000294, "vq_loss_layer_012": 0.000452, "vq_loss_layer_013": 0.000368, "vq_loss_layer_014": 0.000479, "vq_loss_layer_015": 0.000519, "vq_loss_layer_016": 0.000454, "vq_loss_layer_017": 0.000479, "vq_loss_layer_018": 0.000292, "vq_loss_layer_019": 0.000206, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000444, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.00041, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000389, "vq_loss_layer_026": 0.000587, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001305, "vq_loss_layer_030": 0.002975, "vq_loss_layer_031": 0.005127 }, { "ce_loss": 2.303137, "epoch": 0.00485, "grad_norm": 0.003794092684984207, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.055347, "kv_vq_loss": 0.000603, "learning_rate": 0.0009214354346505658, "loss": 0.055927, "step": 4850, "value_mse_loss_layer_000": 0.000687, "value_mse_loss_layer_001": 0.002045, "value_mse_loss_layer_002": 0.008118, "value_mse_loss_layer_003": 0.01239, "value_mse_loss_layer_004": 0.011963, "value_mse_loss_layer_005": 0.012695, "value_mse_loss_layer_006": 0.013916, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.0177, "value_mse_loss_layer_009": 0.023926, "value_mse_loss_layer_010": 0.020264, "value_mse_loss_layer_011": 0.02063, "value_mse_loss_layer_012": 0.026367, "value_mse_loss_layer_013": 0.023071, "value_mse_loss_layer_014": 0.02478, "value_mse_loss_layer_015": 0.027588, "value_mse_loss_layer_016": 0.02478, "value_mse_loss_layer_017": 0.02832, "value_mse_loss_layer_018": 0.026367, "value_mse_loss_layer_019": 0.027954, "value_mse_loss_layer_020": 0.031494, "value_mse_loss_layer_021": 0.044678, "value_mse_loss_layer_022": 0.037354, "value_mse_loss_layer_023": 0.037354, "value_mse_loss_layer_024": 0.043945, "value_mse_loss_layer_025": 0.05249, "value_mse_loss_layer_026": 0.043945, "value_mse_loss_layer_027": 0.058594, "value_mse_loss_layer_028": 0.060059, "value_mse_loss_layer_029": 0.085449, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.084961, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 0.000117, "vq_loss_layer_006": 0.000165, "vq_loss_layer_007": 0.000267, "vq_loss_layer_008": 0.000231, "vq_loss_layer_009": 0.000332, "vq_loss_layer_010": 0.000263, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000786, "vq_loss_layer_013": 0.000387, "vq_loss_layer_014": 0.000523, "vq_loss_layer_015": 0.000553, "vq_loss_layer_016": 0.000546, "vq_loss_layer_017": 0.000534, "vq_loss_layer_018": 0.000299, "vq_loss_layer_019": 0.000198, "vq_loss_layer_020": 0.000322, "vq_loss_layer_021": 0.00066, "vq_loss_layer_022": 0.000416, "vq_loss_layer_023": 0.000336, "vq_loss_layer_024": 0.000368, "vq_loss_layer_025": 0.000454, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.00069, "vq_loss_layer_028": 0.000927, "vq_loss_layer_029": 0.001404, "vq_loss_layer_030": 0.002289, "vq_loss_layer_031": 0.00531 }, { "ce_loss": 2.301986, "epoch": 0.00486, "grad_norm": 0.0032739457674324512, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.055167, "kv_vq_loss": 0.000602, "learning_rate": 0.0009216590673155733, "loss": 0.05574, "step": 4860, "value_mse_loss_layer_000": 0.000671, "value_mse_loss_layer_001": 0.002075, "value_mse_loss_layer_002": 0.007751, "value_mse_loss_layer_003": 0.013, "value_mse_loss_layer_004": 0.011475, "value_mse_loss_layer_005": 0.011597, "value_mse_loss_layer_006": 0.01355, "value_mse_loss_layer_007": 0.014465, "value_mse_loss_layer_008": 0.018188, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.019165, "value_mse_loss_layer_011": 0.021362, "value_mse_loss_layer_012": 0.021362, "value_mse_loss_layer_013": 0.022949, "value_mse_loss_layer_014": 0.023682, "value_mse_loss_layer_015": 0.026123, "value_mse_loss_layer_016": 0.023071, "value_mse_loss_layer_017": 0.026611, "value_mse_loss_layer_018": 0.023315, "value_mse_loss_layer_019": 0.027954, "value_mse_loss_layer_020": 0.027954, "value_mse_loss_layer_021": 0.034668, "value_mse_loss_layer_022": 0.033203, "value_mse_loss_layer_023": 0.036133, "value_mse_loss_layer_024": 0.039551, "value_mse_loss_layer_025": 0.051025, "value_mse_loss_layer_026": 0.041748, "value_mse_loss_layer_027": 0.054199, "value_mse_loss_layer_028": 0.05957, "value_mse_loss_layer_029": 0.080078, "value_mse_loss_layer_030": 0.076172, "value_mse_loss_layer_031": 0.081543, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.00016, "vq_loss_layer_007": 0.000227, "vq_loss_layer_008": 0.000229, "vq_loss_layer_009": 0.000259, "vq_loss_layer_010": 0.000218, "vq_loss_layer_011": 0.000296, "vq_loss_layer_012": 0.000448, "vq_loss_layer_013": 0.000408, "vq_loss_layer_014": 0.000452, "vq_loss_layer_015": 0.000448, "vq_loss_layer_016": 0.000511, "vq_loss_layer_017": 0.000444, "vq_loss_layer_018": 0.000229, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000473, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000298, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.000334, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.00473 }, { "ce_loss": 2.328288, "epoch": 0.00487, "grad_norm": 0.0036461728159338236, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010986, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.062256, "key_mse_loss_layer_005": 0.065918, "key_mse_loss_layer_006": 0.072754, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.100586, "key_mse_loss_layer_024": 0.081543, "key_mse_loss_layer_025": 0.079102, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.095215, "key_mse_loss_layer_028": 0.099609, "key_mse_loss_layer_029": 0.09668, "key_mse_loss_layer_030": 0.106445, "key_mse_loss_layer_031": 0.09082, "kv_mse_loss": 0.054807, "kv_vq_loss": 0.000582, "learning_rate": 0.0009218822403036584, "loss": 0.055365, "step": 4870, "value_mse_loss_layer_000": 0.000648, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.008179, "value_mse_loss_layer_003": 0.012573, "value_mse_loss_layer_004": 0.011353, "value_mse_loss_layer_005": 0.011047, "value_mse_loss_layer_006": 0.013428, "value_mse_loss_layer_007": 0.014465, "value_mse_loss_layer_008": 0.017212, "value_mse_loss_layer_009": 0.021362, "value_mse_loss_layer_010": 0.018311, "value_mse_loss_layer_011": 0.019653, "value_mse_loss_layer_012": 0.02002, "value_mse_loss_layer_013": 0.020264, "value_mse_loss_layer_014": 0.02356, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.020752, "value_mse_loss_layer_017": 0.024902, "value_mse_loss_layer_018": 0.025391, "value_mse_loss_layer_019": 0.028809, "value_mse_loss_layer_020": 0.029419, "value_mse_loss_layer_021": 0.03418, "value_mse_loss_layer_022": 0.034912, "value_mse_loss_layer_023": 0.041992, "value_mse_loss_layer_024": 0.05127, "value_mse_loss_layer_025": 0.057617, "value_mse_loss_layer_026": 0.050293, "value_mse_loss_layer_027": 0.059326, "value_mse_loss_layer_028": 0.062012, "value_mse_loss_layer_029": 0.086914, "value_mse_loss_layer_030": 0.084473, "value_mse_loss_layer_031": 0.086914, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.000174, "vq_loss_layer_007": 0.00023, "vq_loss_layer_008": 0.000221, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.00021, "vq_loss_layer_011": 0.000261, "vq_loss_layer_012": 0.000435, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000393, "vq_loss_layer_017": 0.000397, "vq_loss_layer_018": 0.000345, "vq_loss_layer_019": 0.000213, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000359, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.000319, "vq_loss_layer_024": 0.000336, "vq_loss_layer_025": 0.000397, "vq_loss_layer_026": 0.000748, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.001236, "vq_loss_layer_030": 0.002808, "vq_loss_layer_031": 0.004852 }, { "ce_loss": 2.348009, "epoch": 0.00488, "grad_norm": 0.0036510941572487354, "key_mse_loss_layer_000": 0.003586, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.041748, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.129883, "key_mse_loss_layer_014": 0.126953, "key_mse_loss_layer_015": 0.113281, "key_mse_loss_layer_016": 0.10791, "key_mse_loss_layer_017": 0.105469, "key_mse_loss_layer_018": 0.112793, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.083008, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.055011, "kv_vq_loss": 0.000597, "learning_rate": 0.0009221049555006776, "loss": 0.055585, "step": 4880, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.002075, "value_mse_loss_layer_002": 0.008423, "value_mse_loss_layer_003": 0.012634, "value_mse_loss_layer_004": 0.011963, "value_mse_loss_layer_005": 0.011292, "value_mse_loss_layer_006": 0.013306, "value_mse_loss_layer_007": 0.015015, "value_mse_loss_layer_008": 0.017334, "value_mse_loss_layer_009": 0.022217, "value_mse_loss_layer_010": 0.018433, "value_mse_loss_layer_011": 0.02002, "value_mse_loss_layer_012": 0.020508, "value_mse_loss_layer_013": 0.024048, "value_mse_loss_layer_014": 0.023682, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.023315, "value_mse_loss_layer_019": 0.027954, "value_mse_loss_layer_020": 0.02832, "value_mse_loss_layer_021": 0.032715, "value_mse_loss_layer_022": 0.029785, "value_mse_loss_layer_023": 0.034668, "value_mse_loss_layer_024": 0.046387, "value_mse_loss_layer_025": 0.049561, "value_mse_loss_layer_026": 0.04126, "value_mse_loss_layer_027": 0.056885, "value_mse_loss_layer_028": 0.05542, "value_mse_loss_layer_029": 0.078613, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.095215, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.2e-05, "vq_loss_layer_003": 3.8e-05, "vq_loss_layer_004": 8.5e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000152, "vq_loss_layer_007": 0.000252, "vq_loss_layer_008": 0.000261, "vq_loss_layer_009": 0.000292, "vq_loss_layer_010": 0.000267, "vq_loss_layer_011": 0.000296, "vq_loss_layer_012": 0.000471, "vq_loss_layer_013": 0.000479, "vq_loss_layer_014": 0.000561, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.000479, "vq_loss_layer_017": 0.000397, "vq_loss_layer_018": 0.000257, "vq_loss_layer_019": 0.000271, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000504, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.00033, "vq_loss_layer_024": 0.000561, "vq_loss_layer_025": 0.000534, "vq_loss_layer_026": 0.000633, "vq_loss_layer_027": 0.000793, "vq_loss_layer_028": 0.001251, "vq_loss_layer_029": 0.001564, "vq_loss_layer_030": 0.002777, "vq_loss_layer_031": 0.007751 }, { "ce_loss": 2.255367, "epoch": 0.00489, "grad_norm": 0.0028909510001540184, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.055399, "kv_vq_loss": 0.000614, "learning_rate": 0.0009223272147809049, "loss": 0.056, "step": 4890, "value_mse_loss_layer_000": 0.000721, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.007874, "value_mse_loss_layer_003": 0.013977, "value_mse_loss_layer_004": 0.012695, "value_mse_loss_layer_005": 0.011414, "value_mse_loss_layer_006": 0.014221, "value_mse_loss_layer_007": 0.014526, "value_mse_loss_layer_008": 0.018311, "value_mse_loss_layer_009": 0.022583, "value_mse_loss_layer_010": 0.019775, "value_mse_loss_layer_011": 0.020142, "value_mse_loss_layer_012": 0.021606, "value_mse_loss_layer_013": 0.021851, "value_mse_loss_layer_014": 0.023438, "value_mse_loss_layer_015": 0.025391, "value_mse_loss_layer_016": 0.021851, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.024902, "value_mse_loss_layer_019": 0.028198, "value_mse_loss_layer_020": 0.028442, "value_mse_loss_layer_021": 0.032471, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.036377, "value_mse_loss_layer_024": 0.038818, "value_mse_loss_layer_025": 0.045654, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.052246, "value_mse_loss_layer_028": 0.054688, "value_mse_loss_layer_029": 0.072754, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.080566, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 0.000116, "vq_loss_layer_005": 9.2e-05, "vq_loss_layer_006": 0.000178, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000261, "vq_loss_layer_009": 0.000273, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000277, "vq_loss_layer_012": 0.0005, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000467, "vq_loss_layer_015": 0.000465, "vq_loss_layer_016": 0.000462, "vq_loss_layer_017": 0.000492, "vq_loss_layer_018": 0.000267, "vq_loss_layer_019": 0.000254, "vq_loss_layer_020": 0.000248, "vq_loss_layer_021": 0.000423, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000326, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.000931, "vq_loss_layer_029": 0.000938, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.0047 }, { "ce_loss": 2.301474, "epoch": 0.0049, "grad_norm": 0.0036247167736291885, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.055191, "kv_vq_loss": 0.000585, "learning_rate": 0.0009225490200071284, "loss": 0.055743, "step": 4900, "value_mse_loss_layer_000": 0.000698, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.007935, "value_mse_loss_layer_003": 0.012512, "value_mse_loss_layer_004": 0.012329, "value_mse_loss_layer_005": 0.011902, "value_mse_loss_layer_006": 0.013428, "value_mse_loss_layer_007": 0.015076, "value_mse_loss_layer_008": 0.017944, "value_mse_loss_layer_009": 0.022949, "value_mse_loss_layer_010": 0.019653, "value_mse_loss_layer_011": 0.02063, "value_mse_loss_layer_012": 0.021606, "value_mse_loss_layer_013": 0.021973, "value_mse_loss_layer_014": 0.022949, "value_mse_loss_layer_015": 0.026123, "value_mse_loss_layer_016": 0.022095, "value_mse_loss_layer_017": 0.026611, "value_mse_loss_layer_018": 0.023682, "value_mse_loss_layer_019": 0.028076, "value_mse_loss_layer_020": 0.037598, "value_mse_loss_layer_021": 0.035645, "value_mse_loss_layer_022": 0.033936, "value_mse_loss_layer_023": 0.039307, "value_mse_loss_layer_024": 0.040527, "value_mse_loss_layer_025": 0.051025, "value_mse_loss_layer_026": 0.043701, "value_mse_loss_layer_027": 0.054443, "value_mse_loss_layer_028": 0.059326, "value_mse_loss_layer_029": 0.089355, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.085449, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 9e-05, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000222, "vq_loss_layer_008": 0.000246, "vq_loss_layer_009": 0.000278, "vq_loss_layer_010": 0.000256, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000483, "vq_loss_layer_013": 0.000328, "vq_loss_layer_014": 0.000439, "vq_loss_layer_015": 0.000504, "vq_loss_layer_016": 0.000456, "vq_loss_layer_017": 0.000443, "vq_loss_layer_018": 0.000252, "vq_loss_layer_019": 0.000244, "vq_loss_layer_020": 0.000359, "vq_loss_layer_021": 0.000534, "vq_loss_layer_022": 0.000355, "vq_loss_layer_023": 0.000452, "vq_loss_layer_024": 0.000425, "vq_loss_layer_025": 0.000469, "vq_loss_layer_026": 0.000687, "vq_loss_layer_027": 0.000675, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.001602, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.005524 }, { "ce_loss": 2.314595, "epoch": 0.00491, "grad_norm": 0.003197524230927229, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.054468, "kv_vq_loss": 0.000569, "learning_rate": 0.000922770373030742, "loss": 0.055017, "step": 4910, "value_mse_loss_layer_000": 0.000683, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.007996, "value_mse_loss_layer_003": 0.012756, "value_mse_loss_layer_004": 0.011658, "value_mse_loss_layer_005": 0.011536, "value_mse_loss_layer_006": 0.013733, "value_mse_loss_layer_007": 0.015747, "value_mse_loss_layer_008": 0.018677, "value_mse_loss_layer_009": 0.023682, "value_mse_loss_layer_010": 0.021484, "value_mse_loss_layer_011": 0.021362, "value_mse_loss_layer_012": 0.022339, "value_mse_loss_layer_013": 0.024292, "value_mse_loss_layer_014": 0.027344, "value_mse_loss_layer_015": 0.027222, "value_mse_loss_layer_016": 0.022583, "value_mse_loss_layer_017": 0.026978, "value_mse_loss_layer_018": 0.023682, "value_mse_loss_layer_019": 0.02832, "value_mse_loss_layer_020": 0.031494, "value_mse_loss_layer_021": 0.035889, "value_mse_loss_layer_022": 0.03418, "value_mse_loss_layer_023": 0.037598, "value_mse_loss_layer_024": 0.040771, "value_mse_loss_layer_025": 0.053223, "value_mse_loss_layer_026": 0.043213, "value_mse_loss_layer_027": 0.054932, "value_mse_loss_layer_028": 0.058594, "value_mse_loss_layer_029": 0.08252, "value_mse_loss_layer_030": 0.074219, "value_mse_loss_layer_031": 0.090332, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 8.9e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000252, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.00028, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.000471, "vq_loss_layer_013": 0.000437, "vq_loss_layer_014": 0.000565, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.000444, "vq_loss_layer_017": 0.000477, "vq_loss_layer_018": 0.000269, "vq_loss_layer_019": 0.000231, "vq_loss_layer_020": 0.000288, "vq_loss_layer_021": 0.000496, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.00033, "vq_loss_layer_024": 0.000334, "vq_loss_layer_025": 0.000423, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000607, "vq_loss_layer_028": 0.000942, "vq_loss_layer_029": 0.001358, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.006561 }, { "ce_loss": 2.326551, "epoch": 0.00492, "grad_norm": 0.0038969232700765133, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.055127, "kv_vq_loss": 0.000589, "learning_rate": 0.00092299127569184, "loss": 0.055688, "step": 4920, "value_mse_loss_layer_000": 0.00069, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.007935, "value_mse_loss_layer_003": 0.013367, "value_mse_loss_layer_004": 0.011047, "value_mse_loss_layer_005": 0.011414, "value_mse_loss_layer_006": 0.013367, "value_mse_loss_layer_007": 0.014221, "value_mse_loss_layer_008": 0.017822, "value_mse_loss_layer_009": 0.023193, "value_mse_loss_layer_010": 0.018799, "value_mse_loss_layer_011": 0.019775, "value_mse_loss_layer_012": 0.019897, "value_mse_loss_layer_013": 0.021973, "value_mse_loss_layer_014": 0.022827, "value_mse_loss_layer_015": 0.025391, "value_mse_loss_layer_016": 0.021484, "value_mse_loss_layer_017": 0.025391, "value_mse_loss_layer_018": 0.024414, "value_mse_loss_layer_019": 0.027466, "value_mse_loss_layer_020": 0.029175, "value_mse_loss_layer_021": 0.033203, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.040527, "value_mse_loss_layer_025": 0.050293, "value_mse_loss_layer_026": 0.056396, "value_mse_loss_layer_027": 0.077637, "value_mse_loss_layer_028": 0.061768, "value_mse_loss_layer_029": 0.081055, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.083496, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000145, "vq_loss_layer_007": 0.000198, "vq_loss_layer_008": 0.000219, "vq_loss_layer_009": 0.000305, "vq_loss_layer_010": 0.000218, "vq_loss_layer_011": 0.000252, "vq_loss_layer_012": 0.000406, "vq_loss_layer_013": 0.000399, "vq_loss_layer_014": 0.000429, "vq_loss_layer_015": 0.000448, "vq_loss_layer_016": 0.000422, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000263, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000236, "vq_loss_layer_021": 0.000381, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000242, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000862, "vq_loss_layer_027": 0.001053, "vq_loss_layer_028": 0.000889, "vq_loss_layer_029": 0.001076, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.004578 }, { "ce_loss": 2.281011, "epoch": 0.00493, "grad_norm": 0.004823822993785143, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.055814, "kv_vq_loss": 0.000613, "learning_rate": 0.0009232117298193075, "loss": 0.056412, "step": 4930, "value_mse_loss_layer_000": 0.000683, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.007812, "value_mse_loss_layer_003": 0.012756, "value_mse_loss_layer_004": 0.012878, "value_mse_loss_layer_005": 0.01178, "value_mse_loss_layer_006": 0.013245, "value_mse_loss_layer_007": 0.01532, "value_mse_loss_layer_008": 0.017822, "value_mse_loss_layer_009": 0.022949, "value_mse_loss_layer_010": 0.019287, "value_mse_loss_layer_011": 0.020386, "value_mse_loss_layer_012": 0.022949, "value_mse_loss_layer_013": 0.022583, "value_mse_loss_layer_014": 0.023438, "value_mse_loss_layer_015": 0.026123, "value_mse_loss_layer_016": 0.022949, "value_mse_loss_layer_017": 0.026123, "value_mse_loss_layer_018": 0.023682, "value_mse_loss_layer_019": 0.03064, "value_mse_loss_layer_020": 0.028687, "value_mse_loss_layer_021": 0.033447, "value_mse_loss_layer_022": 0.036865, "value_mse_loss_layer_023": 0.052002, "value_mse_loss_layer_024": 0.044434, "value_mse_loss_layer_025": 0.070312, "value_mse_loss_layer_026": 0.044922, "value_mse_loss_layer_027": 0.058594, "value_mse_loss_layer_028": 0.057617, "value_mse_loss_layer_029": 0.083496, "value_mse_loss_layer_030": 0.074219, "value_mse_loss_layer_031": 0.08252, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 0.00014, "vq_loss_layer_005": 0.0001, "vq_loss_layer_006": 0.000135, "vq_loss_layer_007": 0.000238, "vq_loss_layer_008": 0.000223, "vq_loss_layer_009": 0.00028, "vq_loss_layer_010": 0.000217, "vq_loss_layer_011": 0.000273, "vq_loss_layer_012": 0.000561, "vq_loss_layer_013": 0.000372, "vq_loss_layer_014": 0.000439, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.000462, "vq_loss_layer_017": 0.000397, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.000208, "vq_loss_layer_020": 0.000213, "vq_loss_layer_021": 0.000414, "vq_loss_layer_022": 0.000359, "vq_loss_layer_023": 0.000534, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000484, "vq_loss_layer_026": 0.000603, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000896, "vq_loss_layer_029": 0.001205, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.004852 }, { "ce_loss": 2.31688, "epoch": 0.00494, "grad_norm": 0.0032598988618701696, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.061279, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.055267, "kv_vq_loss": 0.000587, "learning_rate": 0.0009234317372309117, "loss": 0.05582, "step": 4940, "value_mse_loss_layer_000": 0.000687, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.007751, "value_mse_loss_layer_003": 0.012207, "value_mse_loss_layer_004": 0.011597, "value_mse_loss_layer_005": 0.010986, "value_mse_loss_layer_006": 0.013489, "value_mse_loss_layer_007": 0.014221, "value_mse_loss_layer_008": 0.0177, "value_mse_loss_layer_009": 0.023315, "value_mse_loss_layer_010": 0.020874, "value_mse_loss_layer_011": 0.02124, "value_mse_loss_layer_012": 0.022339, "value_mse_loss_layer_013": 0.022583, "value_mse_loss_layer_014": 0.024292, "value_mse_loss_layer_015": 0.027832, "value_mse_loss_layer_016": 0.022339, "value_mse_loss_layer_017": 0.026001, "value_mse_loss_layer_018": 0.023438, "value_mse_loss_layer_019": 0.02832, "value_mse_loss_layer_020": 0.031006, "value_mse_loss_layer_021": 0.034668, "value_mse_loss_layer_022": 0.035156, "value_mse_loss_layer_023": 0.041504, "value_mse_loss_layer_024": 0.040527, "value_mse_loss_layer_025": 0.051758, "value_mse_loss_layer_026": 0.043213, "value_mse_loss_layer_027": 0.05835, "value_mse_loss_layer_028": 0.060059, "value_mse_loss_layer_029": 0.088379, "value_mse_loss_layer_030": 0.076172, "value_mse_loss_layer_031": 0.083496, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 0.000102, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000215, "vq_loss_layer_008": 0.000212, "vq_loss_layer_009": 0.00032, "vq_loss_layer_010": 0.000241, "vq_loss_layer_011": 0.000317, "vq_loss_layer_012": 0.000546, "vq_loss_layer_013": 0.00037, "vq_loss_layer_014": 0.00046, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.000406, "vq_loss_layer_017": 0.000378, "vq_loss_layer_018": 0.000239, "vq_loss_layer_019": 0.000192, "vq_loss_layer_020": 0.000236, "vq_loss_layer_021": 0.000416, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.00034, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000456, "vq_loss_layer_027": 0.000641, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.00119, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.004578 }, { "ce_loss": 2.284926, "epoch": 0.00495, "grad_norm": 0.003594006644561887, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.041016, "key_mse_loss_layer_005": 0.054443, "key_mse_loss_layer_006": 0.062012, "key_mse_loss_layer_007": 0.071289, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.111328, "key_mse_loss_layer_016": 0.103516, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.101562, "key_mse_loss_layer_023": 0.101074, "key_mse_loss_layer_024": 0.081543, "key_mse_loss_layer_025": 0.078125, "key_mse_loss_layer_026": 0.090332, "key_mse_loss_layer_027": 0.091309, "key_mse_loss_layer_028": 0.095703, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.055264, "kv_vq_loss": 0.000593, "learning_rate": 0.000923651299733392, "loss": 0.055829, "step": 4950, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.008423, "value_mse_loss_layer_003": 0.014648, "value_mse_loss_layer_004": 0.012268, "value_mse_loss_layer_005": 0.011353, "value_mse_loss_layer_006": 0.01355, "value_mse_loss_layer_007": 0.014648, "value_mse_loss_layer_008": 0.017944, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.018799, "value_mse_loss_layer_011": 0.019775, "value_mse_loss_layer_012": 0.02063, "value_mse_loss_layer_013": 0.021606, "value_mse_loss_layer_014": 0.022461, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.021118, "value_mse_loss_layer_017": 0.023926, "value_mse_loss_layer_018": 0.023682, "value_mse_loss_layer_019": 0.025879, "value_mse_loss_layer_020": 0.029297, "value_mse_loss_layer_021": 0.032959, "value_mse_loss_layer_022": 0.031128, "value_mse_loss_layer_023": 0.037109, "value_mse_loss_layer_024": 0.039062, "value_mse_loss_layer_025": 0.048096, "value_mse_loss_layer_026": 0.04248, "value_mse_loss_layer_027": 0.057373, "value_mse_loss_layer_028": 0.062988, "value_mse_loss_layer_029": 0.09082, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.084473, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 5.5e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000151, "vq_loss_layer_007": 0.000199, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.000311, "vq_loss_layer_010": 0.000257, "vq_loss_layer_011": 0.000288, "vq_loss_layer_012": 0.00046, "vq_loss_layer_013": 0.000389, "vq_loss_layer_014": 0.000492, "vq_loss_layer_015": 0.000473, "vq_loss_layer_016": 0.0005, "vq_loss_layer_017": 0.00036, "vq_loss_layer_018": 0.000256, "vq_loss_layer_019": 0.000222, "vq_loss_layer_020": 0.000286, "vq_loss_layer_021": 0.000463, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.000307, "vq_loss_layer_024": 0.000336, "vq_loss_layer_025": 0.000425, "vq_loss_layer_026": 0.000538, "vq_loss_layer_027": 0.000721, "vq_loss_layer_028": 0.001526, "vq_loss_layer_029": 0.002106, "vq_loss_layer_030": 0.002945, "vq_loss_layer_031": 0.006256 }, { "ce_loss": 2.299915, "epoch": 0.00496, "grad_norm": 0.00442758621647954, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.053955, "key_mse_loss_layer_004": 0.061279, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.055231, "kv_vq_loss": 0.000588, "learning_rate": 0.0009238704191225493, "loss": 0.055786, "step": 4960, "value_mse_loss_layer_000": 0.000694, "value_mse_loss_layer_001": 0.002075, "value_mse_loss_layer_002": 0.008484, "value_mse_loss_layer_003": 0.013062, "value_mse_loss_layer_004": 0.011353, "value_mse_loss_layer_005": 0.011292, "value_mse_loss_layer_006": 0.013367, "value_mse_loss_layer_007": 0.014587, "value_mse_loss_layer_008": 0.017822, "value_mse_loss_layer_009": 0.023682, "value_mse_loss_layer_010": 0.019531, "value_mse_loss_layer_011": 0.020874, "value_mse_loss_layer_012": 0.021118, "value_mse_loss_layer_013": 0.023315, "value_mse_loss_layer_014": 0.023315, "value_mse_loss_layer_015": 0.026245, "value_mse_loss_layer_016": 0.022095, "value_mse_loss_layer_017": 0.027466, "value_mse_loss_layer_018": 0.024292, "value_mse_loss_layer_019": 0.028809, "value_mse_loss_layer_020": 0.029907, "value_mse_loss_layer_021": 0.0354, "value_mse_loss_layer_022": 0.035156, "value_mse_loss_layer_023": 0.037354, "value_mse_loss_layer_024": 0.047119, "value_mse_loss_layer_025": 0.052979, "value_mse_loss_layer_026": 0.043701, "value_mse_loss_layer_027": 0.065918, "value_mse_loss_layer_028": 0.056641, "value_mse_loss_layer_029": 0.085449, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.085938, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000148, "vq_loss_layer_007": 0.000222, "vq_loss_layer_008": 0.000209, "vq_loss_layer_009": 0.000315, "vq_loss_layer_010": 0.000237, "vq_loss_layer_011": 0.000296, "vq_loss_layer_012": 0.000439, "vq_loss_layer_013": 0.000383, "vq_loss_layer_014": 0.000435, "vq_loss_layer_015": 0.000448, "vq_loss_layer_016": 0.000441, "vq_loss_layer_017": 0.000465, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000217, "vq_loss_layer_020": 0.000271, "vq_loss_layer_021": 0.000475, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000343, "vq_loss_layer_025": 0.000381, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.000919, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.001144, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.00531 }, { "ce_loss": 2.329522, "epoch": 0.00497, "grad_norm": 0.00308865774422884, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.054861, "kv_vq_loss": 0.000576, "learning_rate": 0.000924089097183333, "loss": 0.055423, "step": 4970, "value_mse_loss_layer_000": 0.000694, "value_mse_loss_layer_001": 0.002045, "value_mse_loss_layer_002": 0.007751, "value_mse_loss_layer_003": 0.012817, "value_mse_loss_layer_004": 0.011841, "value_mse_loss_layer_005": 0.011841, "value_mse_loss_layer_006": 0.013977, "value_mse_loss_layer_007": 0.014893, "value_mse_loss_layer_008": 0.017822, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.019165, "value_mse_loss_layer_011": 0.020142, "value_mse_loss_layer_012": 0.02063, "value_mse_loss_layer_013": 0.022583, "value_mse_loss_layer_014": 0.023682, "value_mse_loss_layer_015": 0.026489, "value_mse_loss_layer_016": 0.024658, "value_mse_loss_layer_017": 0.026489, "value_mse_loss_layer_018": 0.025513, "value_mse_loss_layer_019": 0.028076, "value_mse_loss_layer_020": 0.029175, "value_mse_loss_layer_021": 0.033447, "value_mse_loss_layer_022": 0.033203, "value_mse_loss_layer_023": 0.040771, "value_mse_loss_layer_024": 0.043457, "value_mse_loss_layer_025": 0.049316, "value_mse_loss_layer_026": 0.043945, "value_mse_loss_layer_027": 0.056152, "value_mse_loss_layer_028": 0.05835, "value_mse_loss_layer_029": 0.083984, "value_mse_loss_layer_030": 0.080078, "value_mse_loss_layer_031": 0.086426, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 0.0001, "vq_loss_layer_006": 0.000159, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000232, "vq_loss_layer_009": 0.000269, "vq_loss_layer_010": 0.000234, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.000418, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000477, "vq_loss_layer_015": 0.000492, "vq_loss_layer_016": 0.000523, "vq_loss_layer_017": 0.000414, "vq_loss_layer_018": 0.000284, "vq_loss_layer_019": 0.0002, "vq_loss_layer_020": 0.00022, "vq_loss_layer_021": 0.000399, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000364, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000881, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.005188 }, { "ce_loss": 2.268208, "epoch": 0.00498, "grad_norm": 0.004241119604557753, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.055042, "kv_vq_loss": 0.000594, "learning_rate": 0.0009243073356899292, "loss": 0.055618, "step": 4980, "value_mse_loss_layer_000": 0.00069, "value_mse_loss_layer_001": 0.002106, "value_mse_loss_layer_002": 0.00769, "value_mse_loss_layer_003": 0.012695, "value_mse_loss_layer_004": 0.011597, "value_mse_loss_layer_005": 0.011597, "value_mse_loss_layer_006": 0.01355, "value_mse_loss_layer_007": 0.014343, "value_mse_loss_layer_008": 0.017578, "value_mse_loss_layer_009": 0.022583, "value_mse_loss_layer_010": 0.019043, "value_mse_loss_layer_011": 0.020508, "value_mse_loss_layer_012": 0.023193, "value_mse_loss_layer_013": 0.022461, "value_mse_loss_layer_014": 0.02356, "value_mse_loss_layer_015": 0.026245, "value_mse_loss_layer_016": 0.023438, "value_mse_loss_layer_017": 0.025879, "value_mse_loss_layer_018": 0.023438, "value_mse_loss_layer_019": 0.026978, "value_mse_loss_layer_020": 0.028931, "value_mse_loss_layer_021": 0.041016, "value_mse_loss_layer_022": 0.031982, "value_mse_loss_layer_023": 0.038574, "value_mse_loss_layer_024": 0.039551, "value_mse_loss_layer_025": 0.052002, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.054199, "value_mse_loss_layer_028": 0.060303, "value_mse_loss_layer_029": 0.083008, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.083496, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.0002, "vq_loss_layer_008": 0.000224, "vq_loss_layer_009": 0.000261, "vq_loss_layer_010": 0.00024, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000565, "vq_loss_layer_013": 0.000355, "vq_loss_layer_014": 0.000458, "vq_loss_layer_015": 0.000481, "vq_loss_layer_016": 0.000504, "vq_loss_layer_017": 0.000406, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000271, "vq_loss_layer_021": 0.000614, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000378, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.000389, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000637, "vq_loss_layer_028": 0.00095, "vq_loss_layer_029": 0.001251, "vq_loss_layer_030": 0.003067, "vq_loss_layer_031": 0.005066 }, { "ce_loss": 2.312835, "epoch": 0.00499, "grad_norm": 0.0036219165194779634, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.081543, "key_mse_loss_layer_020": 0.088867, "key_mse_loss_layer_021": 0.085449, "key_mse_loss_layer_022": 0.086426, "key_mse_loss_layer_023": 0.083496, "key_mse_loss_layer_024": 0.065918, "key_mse_loss_layer_025": 0.064453, "key_mse_loss_layer_026": 0.073242, "key_mse_loss_layer_027": 0.072266, "key_mse_loss_layer_028": 0.079102, "key_mse_loss_layer_029": 0.075684, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.054752, "kv_vq_loss": 0.000577, "learning_rate": 0.0009245251364058474, "loss": 0.055286, "step": 4990, "value_mse_loss_layer_000": 0.000713, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.007874, "value_mse_loss_layer_003": 0.012146, "value_mse_loss_layer_004": 0.011597, "value_mse_loss_layer_005": 0.011658, "value_mse_loss_layer_006": 0.013428, "value_mse_loss_layer_007": 0.015015, "value_mse_loss_layer_008": 0.018066, "value_mse_loss_layer_009": 0.0271, "value_mse_loss_layer_010": 0.020264, "value_mse_loss_layer_011": 0.02063, "value_mse_loss_layer_012": 0.022949, "value_mse_loss_layer_013": 0.02356, "value_mse_loss_layer_014": 0.025513, "value_mse_loss_layer_015": 0.02771, "value_mse_loss_layer_016": 0.022095, "value_mse_loss_layer_017": 0.026611, "value_mse_loss_layer_018": 0.023071, "value_mse_loss_layer_019": 0.029785, "value_mse_loss_layer_020": 0.028076, "value_mse_loss_layer_021": 0.037109, "value_mse_loss_layer_022": 0.033691, "value_mse_loss_layer_023": 0.037354, "value_mse_loss_layer_024": 0.03833, "value_mse_loss_layer_025": 0.052246, "value_mse_loss_layer_026": 0.042236, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.054688, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.071777, "value_mse_loss_layer_031": 0.080566, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000153, "vq_loss_layer_007": 0.00022, "vq_loss_layer_008": 0.000242, "vq_loss_layer_009": 0.000534, "vq_loss_layer_010": 0.000277, "vq_loss_layer_011": 0.000275, "vq_loss_layer_012": 0.000546, "vq_loss_layer_013": 0.000406, "vq_loss_layer_014": 0.000492, "vq_loss_layer_015": 0.000515, "vq_loss_layer_016": 0.000433, "vq_loss_layer_017": 0.000433, "vq_loss_layer_018": 0.000244, "vq_loss_layer_019": 0.000216, "vq_loss_layer_020": 0.000252, "vq_loss_layer_021": 0.000565, "vq_loss_layer_022": 0.000336, "vq_loss_layer_023": 0.000374, "vq_loss_layer_024": 0.000301, "vq_loss_layer_025": 0.000418, "vq_loss_layer_026": 0.000637, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.000782, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.001938, "vq_loss_layer_031": 0.005127 }, { "ce_loss": 2.26792, "epoch": 0.005, "grad_norm": 0.004827489145100117, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.054526, "kv_vq_loss": 0.000579, "learning_rate": 0.0009247425010840047, "loss": 0.055069, "step": 5000, "value_mse_loss_layer_000": 0.000687, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.007935, "value_mse_loss_layer_003": 0.01239, "value_mse_loss_layer_004": 0.011108, "value_mse_loss_layer_005": 0.010864, "value_mse_loss_layer_006": 0.013794, "value_mse_loss_layer_007": 0.014465, "value_mse_loss_layer_008": 0.017944, "value_mse_loss_layer_009": 0.023438, "value_mse_loss_layer_010": 0.019165, "value_mse_loss_layer_011": 0.020386, "value_mse_loss_layer_012": 0.021362, "value_mse_loss_layer_013": 0.022583, "value_mse_loss_layer_014": 0.024048, "value_mse_loss_layer_015": 0.026978, "value_mse_loss_layer_016": 0.023315, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.02832, "value_mse_loss_layer_019": 0.02832, "value_mse_loss_layer_020": 0.030396, "value_mse_loss_layer_021": 0.03418, "value_mse_loss_layer_022": 0.032715, "value_mse_loss_layer_023": 0.036133, "value_mse_loss_layer_024": 0.049316, "value_mse_loss_layer_025": 0.049561, "value_mse_loss_layer_026": 0.043701, "value_mse_loss_layer_027": 0.056641, "value_mse_loss_layer_028": 0.061035, "value_mse_loss_layer_029": 0.090332, "value_mse_loss_layer_030": 0.074707, "value_mse_loss_layer_031": 0.088867, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000172, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000217, "vq_loss_layer_009": 0.000322, "vq_loss_layer_010": 0.000237, "vq_loss_layer_011": 0.000271, "vq_loss_layer_012": 0.000458, "vq_loss_layer_013": 0.000368, "vq_loss_layer_014": 0.000431, "vq_loss_layer_015": 0.000488, "vq_loss_layer_016": 0.000448, "vq_loss_layer_017": 0.000389, "vq_loss_layer_018": 0.000311, "vq_loss_layer_019": 0.000192, "vq_loss_layer_020": 0.000238, "vq_loss_layer_021": 0.000433, "vq_loss_layer_022": 0.000242, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000359, "vq_loss_layer_025": 0.000347, "vq_loss_layer_026": 0.000542, "vq_loss_layer_027": 0.000599, "vq_loss_layer_028": 0.000839, "vq_loss_layer_029": 0.001289, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.00531 }, { "ce_loss": 2.265831, "epoch": 0.00501, "grad_norm": 0.0030214961152523756, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.054852, "kv_vq_loss": 0.000605, "learning_rate": 0.0009249594314668113, "loss": 0.055438, "step": 5010, "value_mse_loss_layer_000": 0.000694, "value_mse_loss_layer_001": 0.002045, "value_mse_loss_layer_002": 0.007874, "value_mse_loss_layer_003": 0.017944, "value_mse_loss_layer_004": 0.01416, "value_mse_loss_layer_005": 0.011902, "value_mse_loss_layer_006": 0.013306, "value_mse_loss_layer_007": 0.014648, "value_mse_loss_layer_008": 0.017334, "value_mse_loss_layer_009": 0.022461, "value_mse_loss_layer_010": 0.020142, "value_mse_loss_layer_011": 0.02002, "value_mse_loss_layer_012": 0.021118, "value_mse_loss_layer_013": 0.022583, "value_mse_loss_layer_014": 0.023682, "value_mse_loss_layer_015": 0.026733, "value_mse_loss_layer_016": 0.023071, "value_mse_loss_layer_017": 0.0271, "value_mse_loss_layer_018": 0.02417, "value_mse_loss_layer_019": 0.026733, "value_mse_loss_layer_020": 0.028687, "value_mse_loss_layer_021": 0.035156, "value_mse_loss_layer_022": 0.034424, "value_mse_loss_layer_023": 0.042725, "value_mse_loss_layer_024": 0.044678, "value_mse_loss_layer_025": 0.048828, "value_mse_loss_layer_026": 0.04248, "value_mse_loss_layer_027": 0.056641, "value_mse_loss_layer_028": 0.060303, "value_mse_loss_layer_029": 0.083496, "value_mse_loss_layer_030": 0.079102, "value_mse_loss_layer_031": 0.081055, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 4.8e-05, "vq_loss_layer_004": 0.000168, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000142, "vq_loss_layer_007": 0.000209, "vq_loss_layer_008": 0.000208, "vq_loss_layer_009": 0.000261, "vq_loss_layer_010": 0.000257, "vq_loss_layer_011": 0.000269, "vq_loss_layer_012": 0.000427, "vq_loss_layer_013": 0.000366, "vq_loss_layer_014": 0.000429, "vq_loss_layer_015": 0.000546, "vq_loss_layer_016": 0.0005, "vq_loss_layer_017": 0.000452, "vq_loss_layer_018": 0.000257, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.00046, "vq_loss_layer_022": 0.000326, "vq_loss_layer_023": 0.000475, "vq_loss_layer_024": 0.000357, "vq_loss_layer_025": 0.000374, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.000851, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.0047 }, { "ce_loss": 2.306922, "epoch": 0.00502, "grad_norm": 0.00443442165851593, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.060303, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.094238, "key_mse_loss_layer_009": 0.101562, "key_mse_loss_layer_010": 0.112793, "key_mse_loss_layer_011": 0.10791, "key_mse_loss_layer_012": 0.081055, "key_mse_loss_layer_013": 0.143555, "key_mse_loss_layer_014": 0.138672, "key_mse_loss_layer_015": 0.125, "key_mse_loss_layer_016": 0.120605, "key_mse_loss_layer_017": 0.119629, "key_mse_loss_layer_018": 0.131836, "key_mse_loss_layer_019": 0.098145, "key_mse_loss_layer_020": 0.114746, "key_mse_loss_layer_021": 0.109375, "key_mse_loss_layer_022": 0.116211, "key_mse_loss_layer_023": 0.113281, "key_mse_loss_layer_024": 0.089844, "key_mse_loss_layer_025": 0.081543, "key_mse_loss_layer_026": 0.100586, "key_mse_loss_layer_027": 0.094727, "key_mse_loss_layer_028": 0.105469, "key_mse_loss_layer_029": 0.09082, "key_mse_loss_layer_030": 0.103516, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.05513, "kv_vq_loss": 0.000585, "learning_rate": 0.0009251759292862548, "loss": 0.055685, "step": 5020, "value_mse_loss_layer_000": 0.000694, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.008057, "value_mse_loss_layer_003": 0.014465, "value_mse_loss_layer_004": 0.012451, "value_mse_loss_layer_005": 0.011902, "value_mse_loss_layer_006": 0.013428, "value_mse_loss_layer_007": 0.015381, "value_mse_loss_layer_008": 0.017334, "value_mse_loss_layer_009": 0.023071, "value_mse_loss_layer_010": 0.019409, "value_mse_loss_layer_011": 0.021729, "value_mse_loss_layer_012": 0.02063, "value_mse_loss_layer_013": 0.023804, "value_mse_loss_layer_014": 0.023804, "value_mse_loss_layer_015": 0.025513, "value_mse_loss_layer_016": 0.019775, "value_mse_loss_layer_017": 0.024536, "value_mse_loss_layer_018": 0.022949, "value_mse_loss_layer_019": 0.026489, "value_mse_loss_layer_020": 0.027832, "value_mse_loss_layer_021": 0.032471, "value_mse_loss_layer_022": 0.030273, "value_mse_loss_layer_023": 0.034668, "value_mse_loss_layer_024": 0.036377, "value_mse_loss_layer_025": 0.052002, "value_mse_loss_layer_026": 0.041992, "value_mse_loss_layer_027": 0.052979, "value_mse_loss_layer_028": 0.052979, "value_mse_loss_layer_029": 0.077637, "value_mse_loss_layer_030": 0.07373, "value_mse_loss_layer_031": 0.084961, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2.1e-05, "vq_loss_layer_002": 2.6e-05, "vq_loss_layer_003": 6.2e-05, "vq_loss_layer_004": 0.000111, "vq_loss_layer_005": 0.000113, "vq_loss_layer_006": 0.000162, "vq_loss_layer_007": 0.000244, "vq_loss_layer_008": 0.000288, "vq_loss_layer_009": 0.000364, "vq_loss_layer_010": 0.000351, "vq_loss_layer_011": 0.000374, "vq_loss_layer_012": 0.000475, "vq_loss_layer_013": 0.00046, "vq_loss_layer_014": 0.000629, "vq_loss_layer_015": 0.000599, "vq_loss_layer_016": 0.00046, "vq_loss_layer_017": 0.000376, "vq_loss_layer_018": 0.00025, "vq_loss_layer_019": 0.000215, "vq_loss_layer_020": 0.000303, "vq_loss_layer_021": 0.000603, "vq_loss_layer_022": 0.000423, "vq_loss_layer_023": 0.000542, "vq_loss_layer_024": 0.000439, "vq_loss_layer_025": 0.000839, "vq_loss_layer_026": 0.000965, "vq_loss_layer_027": 0.000942, "vq_loss_layer_028": 0.002609, "vq_loss_layer_029": 0.001785, "vq_loss_layer_030": 0.003891, "vq_loss_layer_031": 0.007172 }, { "ce_loss": 2.300032, "epoch": 0.00503, "grad_norm": 0.0030705169774591923, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.081543, "key_mse_loss_layer_020": 0.089355, "key_mse_loss_layer_021": 0.085449, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.075684, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.057617, "kv_mse_loss": 0.055148, "kv_vq_loss": 0.000587, "learning_rate": 0.0009253919962639817, "loss": 0.055704, "step": 5030, "value_mse_loss_layer_000": 0.000671, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.008118, "value_mse_loss_layer_003": 0.013733, "value_mse_loss_layer_004": 0.012207, "value_mse_loss_layer_005": 0.013123, "value_mse_loss_layer_006": 0.015198, "value_mse_loss_layer_007": 0.015381, "value_mse_loss_layer_008": 0.0177, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.019531, "value_mse_loss_layer_011": 0.020264, "value_mse_loss_layer_012": 0.022583, "value_mse_loss_layer_013": 0.022949, "value_mse_loss_layer_014": 0.02417, "value_mse_loss_layer_015": 0.026733, "value_mse_loss_layer_016": 0.021729, "value_mse_loss_layer_017": 0.0271, "value_mse_loss_layer_018": 0.023315, "value_mse_loss_layer_019": 0.026489, "value_mse_loss_layer_020": 0.028442, "value_mse_loss_layer_021": 0.033447, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.034424, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.047607, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.052246, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.081543, "value_mse_loss_layer_030": 0.074707, "value_mse_loss_layer_031": 0.084961, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2.2e-05, "vq_loss_layer_002": 2.4e-05, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 0.000135, "vq_loss_layer_006": 0.000257, "vq_loss_layer_007": 0.000244, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000288, "vq_loss_layer_010": 0.000324, "vq_loss_layer_011": 0.000284, "vq_loss_layer_012": 0.000511, "vq_loss_layer_013": 0.000395, "vq_loss_layer_014": 0.000576, "vq_loss_layer_015": 0.00061, "vq_loss_layer_016": 0.000515, "vq_loss_layer_017": 0.000511, "vq_loss_layer_018": 0.000277, "vq_loss_layer_019": 0.000218, "vq_loss_layer_020": 0.000263, "vq_loss_layer_021": 0.00053, "vq_loss_layer_022": 0.000328, "vq_loss_layer_023": 0.000353, "vq_loss_layer_024": 0.000368, "vq_loss_layer_025": 0.000496, "vq_loss_layer_026": 0.000603, "vq_loss_layer_027": 0.000683, "vq_loss_layer_028": 0.001122, "vq_loss_layer_029": 0.001633, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.006958 }, { "ce_loss": 2.32631, "epoch": 0.00504, "grad_norm": 0.00316413352265954, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.054672, "kv_vq_loss": 0.000571, "learning_rate": 0.0009256076341113813, "loss": 0.055225, "step": 5040, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.002014, "value_mse_loss_layer_002": 0.007751, "value_mse_loss_layer_003": 0.012634, "value_mse_loss_layer_004": 0.011292, "value_mse_loss_layer_005": 0.01178, "value_mse_loss_layer_006": 0.013489, "value_mse_loss_layer_007": 0.014709, "value_mse_loss_layer_008": 0.0177, "value_mse_loss_layer_009": 0.02356, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.020752, "value_mse_loss_layer_012": 0.021118, "value_mse_loss_layer_013": 0.022827, "value_mse_loss_layer_014": 0.02356, "value_mse_loss_layer_015": 0.026001, "value_mse_loss_layer_016": 0.022095, "value_mse_loss_layer_017": 0.026245, "value_mse_loss_layer_018": 0.024902, "value_mse_loss_layer_019": 0.026855, "value_mse_loss_layer_020": 0.028687, "value_mse_loss_layer_021": 0.033203, "value_mse_loss_layer_022": 0.032715, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.038818, "value_mse_loss_layer_025": 0.049316, "value_mse_loss_layer_026": 0.048096, "value_mse_loss_layer_027": 0.055908, "value_mse_loss_layer_028": 0.061768, "value_mse_loss_layer_029": 0.077148, "value_mse_loss_layer_030": 0.076172, "value_mse_loss_layer_031": 0.079102, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 9.8e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000233, "vq_loss_layer_009": 0.000343, "vq_loss_layer_010": 0.000282, "vq_loss_layer_011": 0.000315, "vq_loss_layer_012": 0.00045, "vq_loss_layer_013": 0.00038, "vq_loss_layer_014": 0.000479, "vq_loss_layer_015": 0.000477, "vq_loss_layer_016": 0.000471, "vq_loss_layer_017": 0.000439, "vq_loss_layer_018": 0.000259, "vq_loss_layer_019": 0.00021, "vq_loss_layer_020": 0.000237, "vq_loss_layer_021": 0.000433, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000277, "vq_loss_layer_025": 0.000401, "vq_loss_layer_026": 0.000767, "vq_loss_layer_027": 0.00066, "vq_loss_layer_028": 0.001175, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.004944 }, { "ce_loss": 2.276695, "epoch": 0.00505, "grad_norm": 0.004292660858482122, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.055002, "kv_vq_loss": 0.000594, "learning_rate": 0.0009258228445296651, "loss": 0.055566, "step": 5050, "value_mse_loss_layer_000": 0.000683, "value_mse_loss_layer_001": 0.002045, "value_mse_loss_layer_002": 0.008057, "value_mse_loss_layer_003": 0.011963, "value_mse_loss_layer_004": 0.012146, "value_mse_loss_layer_005": 0.011108, "value_mse_loss_layer_006": 0.013245, "value_mse_loss_layer_007": 0.014893, "value_mse_loss_layer_008": 0.018311, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.019043, "value_mse_loss_layer_011": 0.020996, "value_mse_loss_layer_012": 0.022095, "value_mse_loss_layer_013": 0.022705, "value_mse_loss_layer_014": 0.024658, "value_mse_loss_layer_015": 0.026489, "value_mse_loss_layer_016": 0.022095, "value_mse_loss_layer_017": 0.026123, "value_mse_loss_layer_018": 0.023682, "value_mse_loss_layer_019": 0.026855, "value_mse_loss_layer_020": 0.029297, "value_mse_loss_layer_021": 0.03833, "value_mse_loss_layer_022": 0.038086, "value_mse_loss_layer_023": 0.04248, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.055908, "value_mse_loss_layer_026": 0.04541, "value_mse_loss_layer_027": 0.056641, "value_mse_loss_layer_028": 0.061035, "value_mse_loss_layer_029": 0.092285, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.084473, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 0.000109, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000138, "vq_loss_layer_007": 0.000226, "vq_loss_layer_008": 0.00025, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000216, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.000519, "vq_loss_layer_013": 0.000402, "vq_loss_layer_014": 0.000446, "vq_loss_layer_015": 0.000477, "vq_loss_layer_016": 0.00045, "vq_loss_layer_017": 0.000406, "vq_loss_layer_018": 0.00024, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000504, "vq_loss_layer_022": 0.000393, "vq_loss_layer_023": 0.000347, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000383, "vq_loss_layer_026": 0.000538, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.001587, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.004913 }, { "ce_loss": 2.31423, "epoch": 0.00506, "grad_norm": 0.003975072409957647, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.054791, "kv_vq_loss": 0.000572, "learning_rate": 0.0009260376292099497, "loss": 0.055325, "step": 5060, "value_mse_loss_layer_000": 0.000687, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.007782, "value_mse_loss_layer_003": 0.016846, "value_mse_loss_layer_004": 0.011719, "value_mse_loss_layer_005": 0.011292, "value_mse_loss_layer_006": 0.014221, "value_mse_loss_layer_007": 0.014343, "value_mse_loss_layer_008": 0.017456, "value_mse_loss_layer_009": 0.023193, "value_mse_loss_layer_010": 0.021118, "value_mse_loss_layer_011": 0.020142, "value_mse_loss_layer_012": 0.020874, "value_mse_loss_layer_013": 0.022095, "value_mse_loss_layer_014": 0.024048, "value_mse_loss_layer_015": 0.024658, "value_mse_loss_layer_016": 0.022339, "value_mse_loss_layer_017": 0.02417, "value_mse_loss_layer_018": 0.021606, "value_mse_loss_layer_019": 0.026978, "value_mse_loss_layer_020": 0.035889, "value_mse_loss_layer_021": 0.031494, "value_mse_loss_layer_022": 0.031982, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.049072, "value_mse_loss_layer_026": 0.038818, "value_mse_loss_layer_027": 0.053223, "value_mse_loss_layer_028": 0.052979, "value_mse_loss_layer_029": 0.07373, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.077637, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 7.3e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000189, "vq_loss_layer_007": 0.000211, "vq_loss_layer_008": 0.000233, "vq_loss_layer_009": 0.000317, "vq_loss_layer_010": 0.000292, "vq_loss_layer_011": 0.000277, "vq_loss_layer_012": 0.000456, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.000519, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000504, "vq_loss_layer_017": 0.000368, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.00019, "vq_loss_layer_020": 0.000301, "vq_loss_layer_021": 0.000444, "vq_loss_layer_022": 0.000347, "vq_loss_layer_023": 0.000347, "vq_loss_layer_024": 0.000326, "vq_loss_layer_025": 0.000456, "vq_loss_layer_026": 0.000587, "vq_loss_layer_027": 0.000656, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.005127 }, { "ce_loss": 2.283948, "epoch": 0.00507, "grad_norm": 0.004756398033350706, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.054605, "kv_vq_loss": 0.000596, "learning_rate": 0.0009262519898333338, "loss": 0.055167, "step": 5070, "value_mse_loss_layer_000": 0.000668, "value_mse_loss_layer_001": 0.002014, "value_mse_loss_layer_002": 0.007812, "value_mse_loss_layer_003": 0.013977, "value_mse_loss_layer_004": 0.011475, "value_mse_loss_layer_005": 0.011841, "value_mse_loss_layer_006": 0.013611, "value_mse_loss_layer_007": 0.014587, "value_mse_loss_layer_008": 0.017456, "value_mse_loss_layer_009": 0.022217, "value_mse_loss_layer_010": 0.018799, "value_mse_loss_layer_011": 0.02002, "value_mse_loss_layer_012": 0.025635, "value_mse_loss_layer_013": 0.022705, "value_mse_loss_layer_014": 0.023682, "value_mse_loss_layer_015": 0.026489, "value_mse_loss_layer_016": 0.02356, "value_mse_loss_layer_017": 0.025879, "value_mse_loss_layer_018": 0.022583, "value_mse_loss_layer_019": 0.027588, "value_mse_loss_layer_020": 0.030273, "value_mse_loss_layer_021": 0.033691, "value_mse_loss_layer_022": 0.031982, "value_mse_loss_layer_023": 0.034668, "value_mse_loss_layer_024": 0.041992, "value_mse_loss_layer_025": 0.04834, "value_mse_loss_layer_026": 0.039795, "value_mse_loss_layer_027": 0.052002, "value_mse_loss_layer_028": 0.058594, "value_mse_loss_layer_029": 0.083984, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.084473, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 0.000107, "vq_loss_layer_006": 0.000166, "vq_loss_layer_007": 0.000209, "vq_loss_layer_008": 0.000236, "vq_loss_layer_009": 0.000242, "vq_loss_layer_010": 0.00024, "vq_loss_layer_011": 0.000277, "vq_loss_layer_012": 0.000786, "vq_loss_layer_013": 0.00038, "vq_loss_layer_014": 0.00046, "vq_loss_layer_015": 0.000488, "vq_loss_layer_016": 0.000507, "vq_loss_layer_017": 0.00045, "vq_loss_layer_018": 0.000221, "vq_loss_layer_019": 0.000248, "vq_loss_layer_020": 0.00032, "vq_loss_layer_021": 0.000469, "vq_loss_layer_022": 0.000292, "vq_loss_layer_023": 0.000319, "vq_loss_layer_024": 0.000425, "vq_loss_layer_025": 0.000412, "vq_loss_layer_026": 0.000576, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.001137, "vq_loss_layer_029": 0.001251, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.005829 }, { "ce_loss": 2.315291, "epoch": 0.00508, "grad_norm": 0.0035695258993655443, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.054675, "kv_vq_loss": 0.000591, "learning_rate": 0.0009264659280709797, "loss": 0.055246, "step": 5080, "value_mse_loss_layer_000": 0.000694, "value_mse_loss_layer_001": 0.002045, "value_mse_loss_layer_002": 0.008423, "value_mse_loss_layer_003": 0.012451, "value_mse_loss_layer_004": 0.011475, "value_mse_loss_layer_005": 0.011536, "value_mse_loss_layer_006": 0.01355, "value_mse_loss_layer_007": 0.015991, "value_mse_loss_layer_008": 0.017822, "value_mse_loss_layer_009": 0.023438, "value_mse_loss_layer_010": 0.019531, "value_mse_loss_layer_011": 0.020752, "value_mse_loss_layer_012": 0.022217, "value_mse_loss_layer_013": 0.022949, "value_mse_loss_layer_014": 0.023438, "value_mse_loss_layer_015": 0.026123, "value_mse_loss_layer_016": 0.021729, "value_mse_loss_layer_017": 0.026001, "value_mse_loss_layer_018": 0.024414, "value_mse_loss_layer_019": 0.026367, "value_mse_loss_layer_020": 0.028809, "value_mse_loss_layer_021": 0.03125, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.038086, "value_mse_loss_layer_024": 0.03833, "value_mse_loss_layer_025": 0.045166, "value_mse_loss_layer_026": 0.04541, "value_mse_loss_layer_027": 0.050781, "value_mse_loss_layer_028": 0.056885, "value_mse_loss_layer_029": 0.083008, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.084961, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000145, "vq_loss_layer_007": 0.000256, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000307, "vq_loss_layer_010": 0.000257, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.000479, "vq_loss_layer_013": 0.000368, "vq_loss_layer_014": 0.000463, "vq_loss_layer_015": 0.000507, "vq_loss_layer_016": 0.000473, "vq_loss_layer_017": 0.000429, "vq_loss_layer_018": 0.000271, "vq_loss_layer_019": 0.00021, "vq_loss_layer_020": 0.000273, "vq_loss_layer_021": 0.000433, "vq_loss_layer_022": 0.00032, "vq_loss_layer_023": 0.00046, "vq_loss_layer_024": 0.000412, "vq_loss_layer_025": 0.000462, "vq_loss_layer_026": 0.000805, "vq_loss_layer_027": 0.000641, "vq_loss_layer_028": 0.001114, "vq_loss_layer_029": 0.001213, "vq_loss_layer_030": 0.002823, "vq_loss_layer_031": 0.006378 }, { "ce_loss": 2.331042, "epoch": 0.00509, "grad_norm": 0.003122282912954688, "key_mse_loss_layer_000": 0.002594, "key_mse_loss_layer_001": 0.00946, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.043945, "key_mse_loss_layer_004": 0.040771, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.132812, "key_mse_loss_layer_014": 0.129883, "key_mse_loss_layer_015": 0.115234, "key_mse_loss_layer_016": 0.105957, "key_mse_loss_layer_017": 0.106934, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.071777, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.057861, "kv_mse_loss": 0.054669, "kv_vq_loss": 0.000591, "learning_rate": 0.0009266794455841897, "loss": 0.055237, "step": 5090, "value_mse_loss_layer_000": 0.000641, "value_mse_loss_layer_001": 0.00193, "value_mse_loss_layer_002": 0.007812, "value_mse_loss_layer_003": 0.012024, "value_mse_loss_layer_004": 0.01178, "value_mse_loss_layer_005": 0.010925, "value_mse_loss_layer_006": 0.013733, "value_mse_loss_layer_007": 0.015015, "value_mse_loss_layer_008": 0.017212, "value_mse_loss_layer_009": 0.023315, "value_mse_loss_layer_010": 0.019409, "value_mse_loss_layer_011": 0.021118, "value_mse_loss_layer_012": 0.021484, "value_mse_loss_layer_013": 0.02478, "value_mse_loss_layer_014": 0.025513, "value_mse_loss_layer_015": 0.024902, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.026123, "value_mse_loss_layer_018": 0.020996, "value_mse_loss_layer_019": 0.024902, "value_mse_loss_layer_020": 0.027466, "value_mse_loss_layer_021": 0.031006, "value_mse_loss_layer_022": 0.029297, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.046875, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.077637, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 2.6e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 0.0001, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000202, "vq_loss_layer_007": 0.000236, "vq_loss_layer_008": 0.00029, "vq_loss_layer_009": 0.000317, "vq_loss_layer_010": 0.000301, "vq_loss_layer_011": 0.000391, "vq_loss_layer_012": 0.000526, "vq_loss_layer_013": 0.000471, "vq_loss_layer_014": 0.00066, "vq_loss_layer_015": 0.000507, "vq_loss_layer_016": 0.000465, "vq_loss_layer_017": 0.000633, "vq_loss_layer_018": 0.000235, "vq_loss_layer_019": 0.00025, "vq_loss_layer_020": 0.000315, "vq_loss_layer_021": 0.00061, "vq_loss_layer_022": 0.000465, "vq_loss_layer_023": 0.000774, "vq_loss_layer_024": 0.000538, "vq_loss_layer_025": 0.000904, "vq_loss_layer_026": 0.000729, "vq_loss_layer_027": 0.000935, "vq_loss_layer_028": 0.001175, "vq_loss_layer_029": 0.001602, "vq_loss_layer_030": 0.003448, "vq_loss_layer_031": 0.006561 }, { "ce_loss": 2.263511, "epoch": 0.0051, "grad_norm": 0.0056311762891709805, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.054895, "kv_vq_loss": 0.000584, "learning_rate": 0.0009268925440244841, "loss": 0.055453, "step": 5100, "value_mse_loss_layer_000": 0.000683, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.007782, "value_mse_loss_layer_003": 0.012207, "value_mse_loss_layer_004": 0.010925, "value_mse_loss_layer_005": 0.011719, "value_mse_loss_layer_006": 0.012939, "value_mse_loss_layer_007": 0.014099, "value_mse_loss_layer_008": 0.017456, "value_mse_loss_layer_009": 0.022949, "value_mse_loss_layer_010": 0.018921, "value_mse_loss_layer_011": 0.02002, "value_mse_loss_layer_012": 0.020752, "value_mse_loss_layer_013": 0.021606, "value_mse_loss_layer_014": 0.023315, "value_mse_loss_layer_015": 0.025635, "value_mse_loss_layer_016": 0.021606, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.023926, "value_mse_loss_layer_019": 0.030151, "value_mse_loss_layer_020": 0.028076, "value_mse_loss_layer_021": 0.034424, "value_mse_loss_layer_022": 0.03418, "value_mse_loss_layer_023": 0.043213, "value_mse_loss_layer_024": 0.039307, "value_mse_loss_layer_025": 0.049805, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.052734, "value_mse_loss_layer_028": 0.05957, "value_mse_loss_layer_029": 0.105957, "value_mse_loss_layer_030": 0.075195, "value_mse_loss_layer_031": 0.080566, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 0.000101, "vq_loss_layer_006": 0.000139, "vq_loss_layer_007": 0.000213, "vq_loss_layer_008": 0.000203, "vq_loss_layer_009": 0.000305, "vq_loss_layer_010": 0.000217, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000467, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000467, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000429, "vq_loss_layer_017": 0.000397, "vq_loss_layer_018": 0.000254, "vq_loss_layer_019": 0.00021, "vq_loss_layer_020": 0.000223, "vq_loss_layer_021": 0.000435, "vq_loss_layer_022": 0.000288, "vq_loss_layer_023": 0.000422, "vq_loss_layer_024": 0.000277, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000832, "vq_loss_layer_029": 0.001282, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.311269, "epoch": 0.00511, "grad_norm": 0.0029884884133934975, "key_mse_loss_layer_000": 0.004089, "key_mse_loss_layer_001": 0.01123, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.078125, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.091309, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.092773, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.054803, "kv_vq_loss": 0.000597, "learning_rate": 0.000927105225033678, "loss": 0.05538, "step": 5110, "value_mse_loss_layer_000": 0.000713, "value_mse_loss_layer_001": 0.002151, "value_mse_loss_layer_002": 0.007874, "value_mse_loss_layer_003": 0.012939, "value_mse_loss_layer_004": 0.012146, "value_mse_loss_layer_005": 0.011536, "value_mse_loss_layer_006": 0.014038, "value_mse_loss_layer_007": 0.014771, "value_mse_loss_layer_008": 0.017944, "value_mse_loss_layer_009": 0.021606, "value_mse_loss_layer_010": 0.018799, "value_mse_loss_layer_011": 0.018921, "value_mse_loss_layer_012": 0.02002, "value_mse_loss_layer_013": 0.021362, "value_mse_loss_layer_014": 0.024292, "value_mse_loss_layer_015": 0.025024, "value_mse_loss_layer_016": 0.026733, "value_mse_loss_layer_017": 0.025269, "value_mse_loss_layer_018": 0.024292, "value_mse_loss_layer_019": 0.029053, "value_mse_loss_layer_020": 0.032715, "value_mse_loss_layer_021": 0.040039, "value_mse_loss_layer_022": 0.034424, "value_mse_loss_layer_023": 0.04126, "value_mse_loss_layer_024": 0.049072, "value_mse_loss_layer_025": 0.053955, "value_mse_loss_layer_026": 0.051758, "value_mse_loss_layer_027": 0.064453, "value_mse_loss_layer_028": 0.063965, "value_mse_loss_layer_029": 0.095215, "value_mse_loss_layer_030": 0.09082, "value_mse_loss_layer_031": 0.095703, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000169, "vq_loss_layer_007": 0.000217, "vq_loss_layer_008": 0.000234, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000216, "vq_loss_layer_011": 0.000234, "vq_loss_layer_012": 0.000399, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.00046, "vq_loss_layer_015": 0.000427, "vq_loss_layer_016": 0.000637, "vq_loss_layer_017": 0.000355, "vq_loss_layer_018": 0.000259, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.00046, "vq_loss_layer_022": 0.000203, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000338, "vq_loss_layer_026": 0.000637, "vq_loss_layer_027": 0.000648, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.001404, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.005615 }, { "ce_loss": 2.285026, "epoch": 0.00512, "grad_norm": 0.004262915346771479, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.054788, "kv_vq_loss": 0.000578, "learning_rate": 0.0009273174902439576, "loss": 0.055341, "step": 5120, "value_mse_loss_layer_000": 0.000671, "value_mse_loss_layer_001": 0.00206, "value_mse_loss_layer_002": 0.007599, "value_mse_loss_layer_003": 0.012695, "value_mse_loss_layer_004": 0.011047, "value_mse_loss_layer_005": 0.010803, "value_mse_loss_layer_006": 0.013733, "value_mse_loss_layer_007": 0.014343, "value_mse_loss_layer_008": 0.017334, "value_mse_loss_layer_009": 0.021973, "value_mse_loss_layer_010": 0.018555, "value_mse_loss_layer_011": 0.02063, "value_mse_loss_layer_012": 0.02063, "value_mse_loss_layer_013": 0.022705, "value_mse_loss_layer_014": 0.022949, "value_mse_loss_layer_015": 0.025513, "value_mse_loss_layer_016": 0.023438, "value_mse_loss_layer_017": 0.029541, "value_mse_loss_layer_018": 0.026611, "value_mse_loss_layer_019": 0.027588, "value_mse_loss_layer_020": 0.030518, "value_mse_loss_layer_021": 0.032715, "value_mse_loss_layer_022": 0.03418, "value_mse_loss_layer_023": 0.039062, "value_mse_loss_layer_024": 0.040283, "value_mse_loss_layer_025": 0.053711, "value_mse_loss_layer_026": 0.047607, "value_mse_loss_layer_027": 0.057617, "value_mse_loss_layer_028": 0.075684, "value_mse_loss_layer_029": 0.091797, "value_mse_loss_layer_030": 0.080566, "value_mse_loss_layer_031": 0.083496, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.000166, "vq_loss_layer_007": 0.000209, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000286, "vq_loss_layer_012": 0.000435, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.0005, "vq_loss_layer_017": 0.000534, "vq_loss_layer_018": 0.000305, "vq_loss_layer_019": 0.000192, "vq_loss_layer_020": 0.000252, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.00058, "vq_loss_layer_027": 0.000645, "vq_loss_layer_028": 0.001297, "vq_loss_layer_029": 0.001595, "vq_loss_layer_030": 0.002884, "vq_loss_layer_031": 0.005127 }, { "ce_loss": 2.276261, "epoch": 0.00513, "grad_norm": 0.003154523205012083, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.054782, "kv_vq_loss": 0.000583, "learning_rate": 0.0009275293412779539, "loss": 0.055347, "step": 5130, "value_mse_loss_layer_000": 0.00069, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007599, "value_mse_loss_layer_003": 0.011841, "value_mse_loss_layer_004": 0.011597, "value_mse_loss_layer_005": 0.011475, "value_mse_loss_layer_006": 0.013184, "value_mse_loss_layer_007": 0.014282, "value_mse_loss_layer_008": 0.017944, "value_mse_loss_layer_009": 0.02478, "value_mse_loss_layer_010": 0.019043, "value_mse_loss_layer_011": 0.02063, "value_mse_loss_layer_012": 0.022217, "value_mse_loss_layer_013": 0.022461, "value_mse_loss_layer_014": 0.026123, "value_mse_loss_layer_015": 0.026001, "value_mse_loss_layer_016": 0.022339, "value_mse_loss_layer_017": 0.026245, "value_mse_loss_layer_018": 0.022217, "value_mse_loss_layer_019": 0.027344, "value_mse_loss_layer_020": 0.029053, "value_mse_loss_layer_021": 0.033691, "value_mse_loss_layer_022": 0.034912, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.039307, "value_mse_loss_layer_025": 0.047607, "value_mse_loss_layer_026": 0.041748, "value_mse_loss_layer_027": 0.05249, "value_mse_loss_layer_028": 0.061035, "value_mse_loss_layer_029": 0.081055, "value_mse_loss_layer_030": 0.075195, "value_mse_loss_layer_031": 0.08252, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 0.000108, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000136, "vq_loss_layer_007": 0.000198, "vq_loss_layer_008": 0.000243, "vq_loss_layer_009": 0.000385, "vq_loss_layer_010": 0.000241, "vq_loss_layer_011": 0.000277, "vq_loss_layer_012": 0.000511, "vq_loss_layer_013": 0.000351, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000486, "vq_loss_layer_016": 0.000443, "vq_loss_layer_017": 0.00042, "vq_loss_layer_018": 0.000233, "vq_loss_layer_019": 0.000209, "vq_loss_layer_020": 0.000265, "vq_loss_layer_021": 0.000437, "vq_loss_layer_022": 0.000319, "vq_loss_layer_023": 0.000366, "vq_loss_layer_024": 0.000324, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000484, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000908, "vq_loss_layer_029": 0.001305, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.00473 }, { "ce_loss": 2.284729, "epoch": 0.00514, "grad_norm": 0.003879833035171032, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.055206, "kv_vq_loss": 0.000591, "learning_rate": 0.0009277407797488188, "loss": 0.055777, "step": 5140, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.002045, "value_mse_loss_layer_002": 0.00769, "value_mse_loss_layer_003": 0.013367, "value_mse_loss_layer_004": 0.01239, "value_mse_loss_layer_005": 0.011169, "value_mse_loss_layer_006": 0.014038, "value_mse_loss_layer_007": 0.014832, "value_mse_loss_layer_008": 0.017578, "value_mse_loss_layer_009": 0.022217, "value_mse_loss_layer_010": 0.018555, "value_mse_loss_layer_011": 0.019653, "value_mse_loss_layer_012": 0.020264, "value_mse_loss_layer_013": 0.022095, "value_mse_loss_layer_014": 0.022705, "value_mse_loss_layer_015": 0.02478, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.024658, "value_mse_loss_layer_018": 0.023438, "value_mse_loss_layer_019": 0.026245, "value_mse_loss_layer_020": 0.029297, "value_mse_loss_layer_021": 0.037842, "value_mse_loss_layer_022": 0.031006, "value_mse_loss_layer_023": 0.041992, "value_mse_loss_layer_024": 0.038086, "value_mse_loss_layer_025": 0.057861, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.057129, "value_mse_loss_layer_029": 0.08252, "value_mse_loss_layer_030": 0.072754, "value_mse_loss_layer_031": 0.082031, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 0.000101, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000183, "vq_loss_layer_007": 0.000235, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000271, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.000469, "vq_loss_layer_013": 0.000387, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000496, "vq_loss_layer_016": 0.000439, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000275, "vq_loss_layer_019": 0.000228, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.000626, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.00053, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.000546, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.001015, "vq_loss_layer_029": 0.001953, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.005859 }, { "ce_loss": 2.324623, "epoch": 0.00515, "grad_norm": 0.003265056759119034, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.054694, "kv_vq_loss": 0.000574, "learning_rate": 0.0009279518072602977, "loss": 0.05524, "step": 5150, "value_mse_loss_layer_000": 0.000675, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.008118, "value_mse_loss_layer_003": 0.01239, "value_mse_loss_layer_004": 0.011108, "value_mse_loss_layer_005": 0.010864, "value_mse_loss_layer_006": 0.013306, "value_mse_loss_layer_007": 0.014343, "value_mse_loss_layer_008": 0.017334, "value_mse_loss_layer_009": 0.022583, "value_mse_loss_layer_010": 0.019897, "value_mse_loss_layer_011": 0.019897, "value_mse_loss_layer_012": 0.020752, "value_mse_loss_layer_013": 0.021973, "value_mse_loss_layer_014": 0.023438, "value_mse_loss_layer_015": 0.027588, "value_mse_loss_layer_016": 0.02771, "value_mse_loss_layer_017": 0.025879, "value_mse_loss_layer_018": 0.024536, "value_mse_loss_layer_019": 0.029907, "value_mse_loss_layer_020": 0.029419, "value_mse_loss_layer_021": 0.034668, "value_mse_loss_layer_022": 0.033203, "value_mse_loss_layer_023": 0.0354, "value_mse_loss_layer_024": 0.038086, "value_mse_loss_layer_025": 0.052002, "value_mse_loss_layer_026": 0.040527, "value_mse_loss_layer_027": 0.053467, "value_mse_loss_layer_028": 0.056396, "value_mse_loss_layer_029": 0.078613, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.083008, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000147, "vq_loss_layer_007": 0.000207, "vq_loss_layer_008": 0.000194, "vq_loss_layer_009": 0.000267, "vq_loss_layer_010": 0.000229, "vq_loss_layer_011": 0.000239, "vq_loss_layer_012": 0.000425, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000423, "vq_loss_layer_015": 0.000496, "vq_loss_layer_016": 0.000568, "vq_loss_layer_017": 0.000368, "vq_loss_layer_018": 0.000252, "vq_loss_layer_019": 0.000187, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000444, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000376, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000782, "vq_loss_layer_029": 0.001205, "vq_loss_layer_030": 0.003769, "vq_loss_layer_031": 0.005219 }, { "ce_loss": 2.292305, "epoch": 0.00516, "grad_norm": 0.0038931937888264656, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.055045, "kv_vq_loss": 0.000586, "learning_rate": 0.0009281624254068027, "loss": 0.055606, "step": 5160, "value_mse_loss_layer_000": 0.000664, "value_mse_loss_layer_001": 0.002014, "value_mse_loss_layer_002": 0.007477, "value_mse_loss_layer_003": 0.01239, "value_mse_loss_layer_004": 0.011292, "value_mse_loss_layer_005": 0.011292, "value_mse_loss_layer_006": 0.013245, "value_mse_loss_layer_007": 0.014893, "value_mse_loss_layer_008": 0.017578, "value_mse_loss_layer_009": 0.023438, "value_mse_loss_layer_010": 0.019165, "value_mse_loss_layer_011": 0.020386, "value_mse_loss_layer_012": 0.020752, "value_mse_loss_layer_013": 0.02356, "value_mse_loss_layer_014": 0.02478, "value_mse_loss_layer_015": 0.025879, "value_mse_loss_layer_016": 0.022217, "value_mse_loss_layer_017": 0.026978, "value_mse_loss_layer_018": 0.022827, "value_mse_loss_layer_019": 0.0271, "value_mse_loss_layer_020": 0.02771, "value_mse_loss_layer_021": 0.032959, "value_mse_loss_layer_022": 0.033691, "value_mse_loss_layer_023": 0.036865, "value_mse_loss_layer_024": 0.042969, "value_mse_loss_layer_025": 0.048828, "value_mse_loss_layer_026": 0.043701, "value_mse_loss_layer_027": 0.054443, "value_mse_loss_layer_028": 0.068359, "value_mse_loss_layer_029": 0.09375, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.08252, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000144, "vq_loss_layer_007": 0.000238, "vq_loss_layer_008": 0.00023, "vq_loss_layer_009": 0.000343, "vq_loss_layer_010": 0.000263, "vq_loss_layer_011": 0.000303, "vq_loss_layer_012": 0.000431, "vq_loss_layer_013": 0.000448, "vq_loss_layer_014": 0.000473, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.00045, "vq_loss_layer_017": 0.000488, "vq_loss_layer_018": 0.000226, "vq_loss_layer_019": 0.000216, "vq_loss_layer_020": 0.000215, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000341, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000542, "vq_loss_layer_027": 0.000599, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.00135, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.004761 }, { "ce_loss": 2.328262, "epoch": 0.00517, "grad_norm": 0.0034147370606660843, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.054761, "kv_vq_loss": 0.000574, "learning_rate": 0.0009283726357734855, "loss": 0.055313, "step": 5170, "value_mse_loss_layer_000": 0.000671, "value_mse_loss_layer_001": 0.002045, "value_mse_loss_layer_002": 0.007629, "value_mse_loss_layer_003": 0.011963, "value_mse_loss_layer_004": 0.011108, "value_mse_loss_layer_005": 0.011292, "value_mse_loss_layer_006": 0.013367, "value_mse_loss_layer_007": 0.014282, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.022217, "value_mse_loss_layer_010": 0.018677, "value_mse_loss_layer_011": 0.019775, "value_mse_loss_layer_012": 0.020264, "value_mse_loss_layer_013": 0.021973, "value_mse_loss_layer_014": 0.022827, "value_mse_loss_layer_015": 0.024902, "value_mse_loss_layer_016": 0.021362, "value_mse_loss_layer_017": 0.025391, "value_mse_loss_layer_018": 0.027222, "value_mse_loss_layer_019": 0.02771, "value_mse_loss_layer_020": 0.032227, "value_mse_loss_layer_021": 0.035156, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.038818, "value_mse_loss_layer_025": 0.047363, "value_mse_loss_layer_026": 0.039551, "value_mse_loss_layer_027": 0.062012, "value_mse_loss_layer_028": 0.056885, "value_mse_loss_layer_029": 0.077637, "value_mse_loss_layer_030": 0.072266, "value_mse_loss_layer_031": 0.079102, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000145, "vq_loss_layer_007": 0.000203, "vq_loss_layer_008": 0.00021, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.000216, "vq_loss_layer_011": 0.000259, "vq_loss_layer_012": 0.000435, "vq_loss_layer_013": 0.000343, "vq_loss_layer_014": 0.000444, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.00042, "vq_loss_layer_017": 0.000374, "vq_loss_layer_018": 0.000315, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.000286, "vq_loss_layer_021": 0.000481, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000378, "vq_loss_layer_026": 0.000584, "vq_loss_layer_027": 0.000809, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.004608 }, { "ce_loss": 2.282953, "epoch": 0.00518, "grad_norm": 0.003922984469681978, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.094727, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.054639, "kv_vq_loss": 0.000582, "learning_rate": 0.0009285824399363081, "loss": 0.055185, "step": 5180, "value_mse_loss_layer_000": 0.000664, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007996, "value_mse_loss_layer_003": 0.012268, "value_mse_loss_layer_004": 0.012024, "value_mse_loss_layer_005": 0.011414, "value_mse_loss_layer_006": 0.013733, "value_mse_loss_layer_007": 0.014832, "value_mse_loss_layer_008": 0.017578, "value_mse_loss_layer_009": 0.022583, "value_mse_loss_layer_010": 0.019531, "value_mse_loss_layer_011": 0.019897, "value_mse_loss_layer_012": 0.021851, "value_mse_loss_layer_013": 0.022583, "value_mse_loss_layer_014": 0.023926, "value_mse_loss_layer_015": 0.0271, "value_mse_loss_layer_016": 0.021973, "value_mse_loss_layer_017": 0.0271, "value_mse_loss_layer_018": 0.022705, "value_mse_loss_layer_019": 0.028931, "value_mse_loss_layer_020": 0.031128, "value_mse_loss_layer_021": 0.036621, "value_mse_loss_layer_022": 0.033936, "value_mse_loss_layer_023": 0.038086, "value_mse_loss_layer_024": 0.039551, "value_mse_loss_layer_025": 0.050293, "value_mse_loss_layer_026": 0.047363, "value_mse_loss_layer_027": 0.05542, "value_mse_loss_layer_028": 0.056885, "value_mse_loss_layer_029": 0.087891, "value_mse_loss_layer_030": 0.078125, "value_mse_loss_layer_031": 0.083496, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000168, "vq_loss_layer_007": 0.000234, "vq_loss_layer_008": 0.000214, "vq_loss_layer_009": 0.000271, "vq_loss_layer_010": 0.00025, "vq_loss_layer_011": 0.000275, "vq_loss_layer_012": 0.000526, "vq_loss_layer_013": 0.000378, "vq_loss_layer_014": 0.00045, "vq_loss_layer_015": 0.000553, "vq_loss_layer_016": 0.000431, "vq_loss_layer_017": 0.000492, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000298, "vq_loss_layer_020": 0.000698, "vq_loss_layer_021": 0.000549, "vq_loss_layer_022": 0.000322, "vq_loss_layer_023": 0.000387, "vq_loss_layer_024": 0.000389, "vq_loss_layer_025": 0.000412, "vq_loss_layer_026": 0.001228, "vq_loss_layer_027": 0.00074, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001511, "vq_loss_layer_030": 0.003448, "vq_loss_layer_031": 0.00589 }, { "ce_loss": 2.264478, "epoch": 0.00519, "grad_norm": 0.0038619155529886484, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.124023, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.054617, "kv_vq_loss": 0.000552, "learning_rate": 0.0009287918394621143, "loss": 0.055139, "step": 5190, "value_mse_loss_layer_000": 0.000687, "value_mse_loss_layer_001": 0.001999, "value_mse_loss_layer_002": 0.007812, "value_mse_loss_layer_003": 0.012512, "value_mse_loss_layer_004": 0.011292, "value_mse_loss_layer_005": 0.010925, "value_mse_loss_layer_006": 0.013245, "value_mse_loss_layer_007": 0.014343, "value_mse_loss_layer_008": 0.018188, "value_mse_loss_layer_009": 0.021851, "value_mse_loss_layer_010": 0.018433, "value_mse_loss_layer_011": 0.019897, "value_mse_loss_layer_012": 0.020264, "value_mse_loss_layer_013": 0.02124, "value_mse_loss_layer_014": 0.023438, "value_mse_loss_layer_015": 0.024292, "value_mse_loss_layer_016": 0.022949, "value_mse_loss_layer_017": 0.024536, "value_mse_loss_layer_018": 0.022339, "value_mse_loss_layer_019": 0.027222, "value_mse_loss_layer_020": 0.02771, "value_mse_loss_layer_021": 0.034424, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.036865, "value_mse_loss_layer_024": 0.039551, "value_mse_loss_layer_025": 0.048584, "value_mse_loss_layer_026": 0.04126, "value_mse_loss_layer_027": 0.055176, "value_mse_loss_layer_028": 0.056641, "value_mse_loss_layer_029": 0.08252, "value_mse_loss_layer_030": 0.074707, "value_mse_loss_layer_031": 0.080566, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000149, "vq_loss_layer_007": 0.000213, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.00028, "vq_loss_layer_010": 0.000233, "vq_loss_layer_011": 0.00029, "vq_loss_layer_012": 0.000452, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000507, "vq_loss_layer_015": 0.00045, "vq_loss_layer_016": 0.000523, "vq_loss_layer_017": 0.000399, "vq_loss_layer_018": 0.000232, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000523, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.00032, "vq_loss_layer_024": 0.00032, "vq_loss_layer_025": 0.000397, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000648, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.001442, "vq_loss_layer_030": 0.002441, "vq_loss_layer_031": 0.005249 }, { "ce_loss": 2.281636, "epoch": 0.0052, "grad_norm": 0.0035130390897393227, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.124023, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.109863, "key_mse_loss_layer_016": 0.102051, "key_mse_loss_layer_017": 0.104004, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.055115, "kv_vq_loss": 0.000579, "learning_rate": 0.0009290008359086997, "loss": 0.05567, "step": 5200, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.001999, "value_mse_loss_layer_002": 0.007599, "value_mse_loss_layer_003": 0.012024, "value_mse_loss_layer_004": 0.011536, "value_mse_loss_layer_005": 0.01123, "value_mse_loss_layer_006": 0.013062, "value_mse_loss_layer_007": 0.014771, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.021851, "value_mse_loss_layer_010": 0.018188, "value_mse_loss_layer_011": 0.019287, "value_mse_loss_layer_012": 0.020142, "value_mse_loss_layer_013": 0.021118, "value_mse_loss_layer_014": 0.021973, "value_mse_loss_layer_015": 0.023926, "value_mse_loss_layer_016": 0.019531, "value_mse_loss_layer_017": 0.023682, "value_mse_loss_layer_018": 0.022949, "value_mse_loss_layer_019": 0.025635, "value_mse_loss_layer_020": 0.026123, "value_mse_loss_layer_021": 0.031494, "value_mse_loss_layer_022": 0.031982, "value_mse_loss_layer_023": 0.032715, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.043945, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.050049, "value_mse_loss_layer_028": 0.057373, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.080078, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 8.8e-05, "vq_loss_layer_006": 0.000142, "vq_loss_layer_007": 0.000241, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000243, "vq_loss_layer_011": 0.00029, "vq_loss_layer_012": 0.000484, "vq_loss_layer_013": 0.000341, "vq_loss_layer_014": 0.000471, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.000437, "vq_loss_layer_017": 0.00037, "vq_loss_layer_018": 0.000238, "vq_loss_layer_019": 0.000205, "vq_loss_layer_020": 0.000256, "vq_loss_layer_021": 0.000511, "vq_loss_layer_022": 0.000383, "vq_loss_layer_023": 0.000376, "vq_loss_layer_024": 0.000399, "vq_loss_layer_025": 0.000469, "vq_loss_layer_026": 0.000664, "vq_loss_layer_027": 0.000679, "vq_loss_layer_028": 0.001251, "vq_loss_layer_029": 0.001305, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.005615 }, { "ce_loss": 2.302942, "epoch": 0.00521, "grad_norm": 0.00377299333922565, "key_mse_loss_layer_000": 0.002625, "key_mse_loss_layer_001": 0.009583, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.060059, "kv_mse_loss": 0.054803, "kv_vq_loss": 0.000568, "learning_rate": 0.0009292094308248809, "loss": 0.055341, "step": 5210, "value_mse_loss_layer_000": 0.000648, "value_mse_loss_layer_001": 0.001945, "value_mse_loss_layer_002": 0.007812, "value_mse_loss_layer_003": 0.012329, "value_mse_loss_layer_004": 0.011597, "value_mse_loss_layer_005": 0.011353, "value_mse_loss_layer_006": 0.013428, "value_mse_loss_layer_007": 0.014709, "value_mse_loss_layer_008": 0.016846, "value_mse_loss_layer_009": 0.025146, "value_mse_loss_layer_010": 0.020142, "value_mse_loss_layer_011": 0.020142, "value_mse_loss_layer_012": 0.020752, "value_mse_loss_layer_013": 0.02356, "value_mse_loss_layer_014": 0.025269, "value_mse_loss_layer_015": 0.025879, "value_mse_loss_layer_016": 0.022339, "value_mse_loss_layer_017": 0.02832, "value_mse_loss_layer_018": 0.021973, "value_mse_loss_layer_019": 0.027466, "value_mse_loss_layer_020": 0.030273, "value_mse_loss_layer_021": 0.032715, "value_mse_loss_layer_022": 0.030884, "value_mse_loss_layer_023": 0.036133, "value_mse_loss_layer_024": 0.037109, "value_mse_loss_layer_025": 0.047119, "value_mse_loss_layer_026": 0.045166, "value_mse_loss_layer_027": 0.055908, "value_mse_loss_layer_028": 0.054443, "value_mse_loss_layer_029": 0.07666, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.078125, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000159, "vq_loss_layer_007": 0.000224, "vq_loss_layer_008": 0.000208, "vq_loss_layer_009": 0.000385, "vq_loss_layer_010": 0.000257, "vq_loss_layer_011": 0.000256, "vq_loss_layer_012": 0.000437, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000469, "vq_loss_layer_015": 0.000538, "vq_loss_layer_016": 0.000452, "vq_loss_layer_017": 0.000523, "vq_loss_layer_018": 0.000243, "vq_loss_layer_019": 0.000194, "vq_loss_layer_020": 0.000294, "vq_loss_layer_021": 0.000471, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.000376, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000418, "vq_loss_layer_026": 0.000729, "vq_loss_layer_027": 0.000702, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.003174, "vq_loss_layer_031": 0.004974 }, { "ce_loss": 2.270479, "epoch": 0.00522, "grad_norm": 0.004388931207358837, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.054758, "kv_vq_loss": 0.00058, "learning_rate": 0.0009294176257505654, "loss": 0.05531, "step": 5220, "value_mse_loss_layer_000": 0.00069, "value_mse_loss_layer_001": 0.002121, "value_mse_loss_layer_002": 0.007812, "value_mse_loss_layer_003": 0.012695, "value_mse_loss_layer_004": 0.011902, "value_mse_loss_layer_005": 0.011902, "value_mse_loss_layer_006": 0.013184, "value_mse_loss_layer_007": 0.015076, "value_mse_loss_layer_008": 0.017334, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.018921, "value_mse_loss_layer_011": 0.02002, "value_mse_loss_layer_012": 0.022705, "value_mse_loss_layer_013": 0.022095, "value_mse_loss_layer_014": 0.022949, "value_mse_loss_layer_015": 0.025757, "value_mse_loss_layer_016": 0.022705, "value_mse_loss_layer_017": 0.025879, "value_mse_loss_layer_018": 0.023438, "value_mse_loss_layer_019": 0.026001, "value_mse_loss_layer_020": 0.029297, "value_mse_loss_layer_021": 0.034424, "value_mse_loss_layer_022": 0.032715, "value_mse_loss_layer_023": 0.034912, "value_mse_loss_layer_024": 0.040039, "value_mse_loss_layer_025": 0.046875, "value_mse_loss_layer_026": 0.040771, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.055176, "value_mse_loss_layer_029": 0.091309, "value_mse_loss_layer_030": 0.073242, "value_mse_loss_layer_031": 0.080078, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000228, "vq_loss_layer_008": 0.000199, "vq_loss_layer_009": 0.000273, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000261, "vq_loss_layer_012": 0.000557, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.000423, "vq_loss_layer_015": 0.000475, "vq_loss_layer_016": 0.000454, "vq_loss_layer_017": 0.000437, "vq_loss_layer_018": 0.00025, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.000232, "vq_loss_layer_021": 0.00046, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.00034, "vq_loss_layer_025": 0.000429, "vq_loss_layer_026": 0.000557, "vq_loss_layer_027": 0.00066, "vq_loss_layer_028": 0.00103, "vq_loss_layer_029": 0.001465, "vq_loss_layer_030": 0.003006, "vq_loss_layer_031": 0.005096 }, { "ce_loss": 2.292957, "epoch": 0.00523, "grad_norm": 0.0038489520084112883, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.078613, "kv_mse_loss": 0.054477, "kv_vq_loss": 0.000573, "learning_rate": 0.0009296254222168185, "loss": 0.05502, "step": 5230, "value_mse_loss_layer_000": 0.000671, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.007629, "value_mse_loss_layer_003": 0.012329, "value_mse_loss_layer_004": 0.010925, "value_mse_loss_layer_005": 0.010742, "value_mse_loss_layer_006": 0.012878, "value_mse_loss_layer_007": 0.014465, "value_mse_loss_layer_008": 0.017212, "value_mse_loss_layer_009": 0.021606, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.02002, "value_mse_loss_layer_012": 0.020752, "value_mse_loss_layer_013": 0.021606, "value_mse_loss_layer_014": 0.022827, "value_mse_loss_layer_015": 0.025391, "value_mse_loss_layer_016": 0.021973, "value_mse_loss_layer_017": 0.025879, "value_mse_loss_layer_018": 0.022217, "value_mse_loss_layer_019": 0.026733, "value_mse_loss_layer_020": 0.028931, "value_mse_loss_layer_021": 0.037354, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.035156, "value_mse_loss_layer_024": 0.038574, "value_mse_loss_layer_025": 0.04834, "value_mse_loss_layer_026": 0.049316, "value_mse_loss_layer_027": 0.056641, "value_mse_loss_layer_028": 0.058838, "value_mse_loss_layer_029": 0.075195, "value_mse_loss_layer_030": 0.072754, "value_mse_loss_layer_031": 0.081543, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000137, "vq_loss_layer_007": 0.000224, "vq_loss_layer_008": 0.000215, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000275, "vq_loss_layer_011": 0.000284, "vq_loss_layer_012": 0.000437, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.000443, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.000452, "vq_loss_layer_017": 0.000391, "vq_loss_layer_018": 0.000209, "vq_loss_layer_019": 0.000188, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.000448, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000664, "vq_loss_layer_027": 0.000679, "vq_loss_layer_028": 0.000954, "vq_loss_layer_029": 0.001244, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.004791 }, { "ce_loss": 2.235966, "epoch": 0.00524, "grad_norm": 0.0029507256112992764, "key_mse_loss_layer_000": 0.002792, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.09375, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.055035, "kv_vq_loss": 0.000591, "learning_rate": 0.0009298328217459314, "loss": 0.055597, "step": 5240, "value_mse_loss_layer_000": 0.000652, "value_mse_loss_layer_001": 0.001953, "value_mse_loss_layer_002": 0.007599, "value_mse_loss_layer_003": 0.012024, "value_mse_loss_layer_004": 0.011047, "value_mse_loss_layer_005": 0.011108, "value_mse_loss_layer_006": 0.013428, "value_mse_loss_layer_007": 0.014465, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.022583, "value_mse_loss_layer_010": 0.019287, "value_mse_loss_layer_011": 0.020264, "value_mse_loss_layer_012": 0.021729, "value_mse_loss_layer_013": 0.022461, "value_mse_loss_layer_014": 0.023804, "value_mse_loss_layer_015": 0.025024, "value_mse_loss_layer_016": 0.020752, "value_mse_loss_layer_017": 0.025513, "value_mse_loss_layer_018": 0.021973, "value_mse_loss_layer_019": 0.025513, "value_mse_loss_layer_020": 0.028198, "value_mse_loss_layer_021": 0.032471, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.037598, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.047119, "value_mse_loss_layer_026": 0.040527, "value_mse_loss_layer_027": 0.054443, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.072754, "value_mse_loss_layer_030": 0.070801, "value_mse_loss_layer_031": 0.075195, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 8.9e-05, "vq_loss_layer_006": 0.000157, "vq_loss_layer_007": 0.000219, "vq_loss_layer_008": 0.000225, "vq_loss_layer_009": 0.000296, "vq_loss_layer_010": 0.000244, "vq_loss_layer_011": 0.000303, "vq_loss_layer_012": 0.000488, "vq_loss_layer_013": 0.000393, "vq_loss_layer_014": 0.000526, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.000402, "vq_loss_layer_017": 0.000395, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000187, "vq_loss_layer_020": 0.000263, "vq_loss_layer_021": 0.000479, "vq_loss_layer_022": 0.000334, "vq_loss_layer_023": 0.000454, "vq_loss_layer_024": 0.000351, "vq_loss_layer_025": 0.0005, "vq_loss_layer_026": 0.000755, "vq_loss_layer_027": 0.000854, "vq_loss_layer_028": 0.001343, "vq_loss_layer_029": 0.002686, "vq_loss_layer_030": 0.003326, "vq_loss_layer_031": 0.006744 }, { "ce_loss": 2.29273, "epoch": 0.00525, "grad_norm": 0.0031273330096155405, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.121582, "key_mse_loss_layer_015": 0.110352, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.105957, "key_mse_loss_layer_018": 0.111816, "key_mse_loss_layer_019": 0.094727, "key_mse_loss_layer_020": 0.106934, "key_mse_loss_layer_021": 0.101074, "key_mse_loss_layer_022": 0.103027, "key_mse_loss_layer_023": 0.105957, "key_mse_loss_layer_024": 0.083984, "key_mse_loss_layer_025": 0.080566, "key_mse_loss_layer_026": 0.092773, "key_mse_loss_layer_027": 0.090332, "key_mse_loss_layer_028": 0.097168, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.054816, "kv_vq_loss": 0.000573, "learning_rate": 0.0009300398258514891, "loss": 0.055371, "step": 5250, "value_mse_loss_layer_000": 0.000671, "value_mse_loss_layer_001": 0.001999, "value_mse_loss_layer_002": 0.007812, "value_mse_loss_layer_003": 0.013062, "value_mse_loss_layer_004": 0.012268, "value_mse_loss_layer_005": 0.011414, "value_mse_loss_layer_006": 0.013367, "value_mse_loss_layer_007": 0.014526, "value_mse_loss_layer_008": 0.017578, "value_mse_loss_layer_009": 0.022705, "value_mse_loss_layer_010": 0.019409, "value_mse_loss_layer_011": 0.020386, "value_mse_loss_layer_012": 0.021606, "value_mse_loss_layer_013": 0.022461, "value_mse_loss_layer_014": 0.023193, "value_mse_loss_layer_015": 0.024658, "value_mse_loss_layer_016": 0.028442, "value_mse_loss_layer_017": 0.026855, "value_mse_loss_layer_018": 0.023193, "value_mse_loss_layer_019": 0.026978, "value_mse_loss_layer_020": 0.030029, "value_mse_loss_layer_021": 0.033447, "value_mse_loss_layer_022": 0.033203, "value_mse_loss_layer_023": 0.039795, "value_mse_loss_layer_024": 0.042236, "value_mse_loss_layer_025": 0.052002, "value_mse_loss_layer_026": 0.045166, "value_mse_loss_layer_027": 0.057617, "value_mse_loss_layer_028": 0.063965, "value_mse_loss_layer_029": 0.083984, "value_mse_loss_layer_030": 0.07959, "value_mse_loss_layer_031": 0.093262, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 9.3e-05, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000158, "vq_loss_layer_007": 0.000202, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.00028, "vq_loss_layer_010": 0.000269, "vq_loss_layer_011": 0.00029, "vq_loss_layer_012": 0.000456, "vq_loss_layer_013": 0.000359, "vq_loss_layer_014": 0.000467, "vq_loss_layer_015": 0.000475, "vq_loss_layer_016": 0.000683, "vq_loss_layer_017": 0.000534, "vq_loss_layer_018": 0.000286, "vq_loss_layer_019": 0.000301, "vq_loss_layer_020": 0.00029, "vq_loss_layer_021": 0.000433, "vq_loss_layer_022": 0.000319, "vq_loss_layer_023": 0.000429, "vq_loss_layer_024": 0.00038, "vq_loss_layer_025": 0.000526, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000713, "vq_loss_layer_028": 0.001335, "vq_loss_layer_029": 0.001579, "vq_loss_layer_030": 0.00325, "vq_loss_layer_031": 0.006836 }, { "ce_loss": 2.266577, "epoch": 0.00526, "grad_norm": 0.004006203263998032, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.041016, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.061768, "key_mse_loss_layer_007": 0.071777, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.128906, "key_mse_loss_layer_015": 0.112793, "key_mse_loss_layer_016": 0.108398, "key_mse_loss_layer_017": 0.108887, "key_mse_loss_layer_018": 0.116211, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.098633, "key_mse_loss_layer_022": 0.103516, "key_mse_loss_layer_023": 0.104004, "key_mse_loss_layer_024": 0.083496, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.090332, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.09668, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.054678, "kv_vq_loss": 0.000577, "learning_rate": 0.0009302464360384348, "loss": 0.055222, "step": 5260, "value_mse_loss_layer_000": 0.000671, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007751, "value_mse_loss_layer_003": 0.012817, "value_mse_loss_layer_004": 0.011963, "value_mse_loss_layer_005": 0.011353, "value_mse_loss_layer_006": 0.012878, "value_mse_loss_layer_007": 0.015198, "value_mse_loss_layer_008": 0.017212, "value_mse_loss_layer_009": 0.02124, "value_mse_loss_layer_010": 0.021118, "value_mse_loss_layer_011": 0.019165, "value_mse_loss_layer_012": 0.019775, "value_mse_loss_layer_013": 0.021362, "value_mse_loss_layer_014": 0.023804, "value_mse_loss_layer_015": 0.023926, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.02356, "value_mse_loss_layer_018": 0.022705, "value_mse_loss_layer_019": 0.026123, "value_mse_loss_layer_020": 0.026978, "value_mse_loss_layer_021": 0.034668, "value_mse_loss_layer_022": 0.02832, "value_mse_loss_layer_023": 0.036377, "value_mse_loss_layer_024": 0.040527, "value_mse_loss_layer_025": 0.045166, "value_mse_loss_layer_026": 0.042236, "value_mse_loss_layer_027": 0.058105, "value_mse_loss_layer_028": 0.054932, "value_mse_loss_layer_029": 0.08252, "value_mse_loss_layer_030": 0.078613, "value_mse_loss_layer_031": 0.083984, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 4.4e-05, "vq_loss_layer_004": 9.5e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000148, "vq_loss_layer_007": 0.000246, "vq_loss_layer_008": 0.000257, "vq_loss_layer_009": 0.000269, "vq_loss_layer_010": 0.000362, "vq_loss_layer_011": 0.000305, "vq_loss_layer_012": 0.000441, "vq_loss_layer_013": 0.000383, "vq_loss_layer_014": 0.000637, "vq_loss_layer_015": 0.000484, "vq_loss_layer_016": 0.0005, "vq_loss_layer_017": 0.000376, "vq_loss_layer_018": 0.000269, "vq_loss_layer_019": 0.000206, "vq_loss_layer_020": 0.000226, "vq_loss_layer_021": 0.000641, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000362, "vq_loss_layer_024": 0.000343, "vq_loss_layer_025": 0.000364, "vq_loss_layer_026": 0.000675, "vq_loss_layer_027": 0.000706, "vq_loss_layer_028": 0.001251, "vq_loss_layer_029": 0.002029, "vq_loss_layer_030": 0.002502, "vq_loss_layer_031": 0.006592 }, { "ce_loss": 2.309057, "epoch": 0.00527, "grad_norm": 0.002623562002554536, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.054465, "kv_vq_loss": 0.00057, "learning_rate": 0.0009304526538031365, "loss": 0.055014, "step": 5270, "value_mse_loss_layer_000": 0.000671, "value_mse_loss_layer_001": 0.002075, "value_mse_loss_layer_002": 0.007751, "value_mse_loss_layer_003": 0.013245, "value_mse_loss_layer_004": 0.011292, "value_mse_loss_layer_005": 0.011353, "value_mse_loss_layer_006": 0.014038, "value_mse_loss_layer_007": 0.014465, "value_mse_loss_layer_008": 0.017578, "value_mse_loss_layer_009": 0.022949, "value_mse_loss_layer_010": 0.019287, "value_mse_loss_layer_011": 0.02002, "value_mse_loss_layer_012": 0.024048, "value_mse_loss_layer_013": 0.022827, "value_mse_loss_layer_014": 0.02356, "value_mse_loss_layer_015": 0.026855, "value_mse_loss_layer_016": 0.022095, "value_mse_loss_layer_017": 0.026367, "value_mse_loss_layer_018": 0.026367, "value_mse_loss_layer_019": 0.028198, "value_mse_loss_layer_020": 0.028931, "value_mse_loss_layer_021": 0.036133, "value_mse_loss_layer_022": 0.033447, "value_mse_loss_layer_023": 0.036133, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.047852, "value_mse_loss_layer_026": 0.042236, "value_mse_loss_layer_027": 0.055176, "value_mse_loss_layer_028": 0.056885, "value_mse_loss_layer_029": 0.07959, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.081543, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 9.4e-05, "vq_loss_layer_006": 0.000172, "vq_loss_layer_007": 0.000217, "vq_loss_layer_008": 0.000231, "vq_loss_layer_009": 0.000315, "vq_loss_layer_010": 0.00025, "vq_loss_layer_011": 0.000269, "vq_loss_layer_012": 0.000622, "vq_loss_layer_013": 0.000374, "vq_loss_layer_014": 0.000443, "vq_loss_layer_015": 0.000504, "vq_loss_layer_016": 0.000448, "vq_loss_layer_017": 0.000481, "vq_loss_layer_018": 0.00034, "vq_loss_layer_019": 0.000191, "vq_loss_layer_020": 0.00024, "vq_loss_layer_021": 0.000542, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000345, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.005005 }, { "ce_loss": 2.332121, "epoch": 0.00528, "grad_norm": 0.0029935454949736595, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.077637, "kv_mse_loss": 0.054221, "kv_vq_loss": 0.000556, "learning_rate": 0.0009306584806334529, "loss": 0.054749, "step": 5280, "value_mse_loss_layer_000": 0.000675, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.007812, "value_mse_loss_layer_003": 0.01239, "value_mse_loss_layer_004": 0.010925, "value_mse_loss_layer_005": 0.010925, "value_mse_loss_layer_006": 0.013184, "value_mse_loss_layer_007": 0.013977, "value_mse_loss_layer_008": 0.018188, "value_mse_loss_layer_009": 0.022217, "value_mse_loss_layer_010": 0.018555, "value_mse_loss_layer_011": 0.019653, "value_mse_loss_layer_012": 0.020264, "value_mse_loss_layer_013": 0.022827, "value_mse_loss_layer_014": 0.022583, "value_mse_loss_layer_015": 0.025269, "value_mse_loss_layer_016": 0.020874, "value_mse_loss_layer_017": 0.025146, "value_mse_loss_layer_018": 0.023438, "value_mse_loss_layer_019": 0.027222, "value_mse_loss_layer_020": 0.028564, "value_mse_loss_layer_021": 0.032715, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.046387, "value_mse_loss_layer_024": 0.040283, "value_mse_loss_layer_025": 0.04834, "value_mse_loss_layer_026": 0.042236, "value_mse_loss_layer_027": 0.053711, "value_mse_loss_layer_028": 0.061035, "value_mse_loss_layer_029": 0.07666, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.083008, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000149, "vq_loss_layer_007": 0.000204, "vq_loss_layer_008": 0.000298, "vq_loss_layer_009": 0.000271, "vq_loss_layer_010": 0.00022, "vq_loss_layer_011": 0.000257, "vq_loss_layer_012": 0.000425, "vq_loss_layer_013": 0.000399, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000431, "vq_loss_layer_016": 0.000402, "vq_loss_layer_017": 0.000364, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.000206, "vq_loss_layer_020": 0.000226, "vq_loss_layer_021": 0.00037, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000462, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000912, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.005249 }, { "ce_loss": 2.268929, "epoch": 0.00529, "grad_norm": 0.0034414620604366064, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.054373, "kv_vq_loss": 0.000568, "learning_rate": 0.0009308639180087962, "loss": 0.054922, "step": 5290, "value_mse_loss_layer_000": 0.000668, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007629, "value_mse_loss_layer_003": 0.011963, "value_mse_loss_layer_004": 0.012207, "value_mse_loss_layer_005": 0.012207, "value_mse_loss_layer_006": 0.013123, "value_mse_loss_layer_007": 0.015625, "value_mse_loss_layer_008": 0.017456, "value_mse_loss_layer_009": 0.022095, "value_mse_loss_layer_010": 0.020264, "value_mse_loss_layer_011": 0.020264, "value_mse_loss_layer_012": 0.020874, "value_mse_loss_layer_013": 0.022461, "value_mse_loss_layer_014": 0.02417, "value_mse_loss_layer_015": 0.0271, "value_mse_loss_layer_016": 0.021851, "value_mse_loss_layer_017": 0.025635, "value_mse_loss_layer_018": 0.022705, "value_mse_loss_layer_019": 0.026367, "value_mse_loss_layer_020": 0.027588, "value_mse_loss_layer_021": 0.036865, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.046631, "value_mse_loss_layer_026": 0.04541, "value_mse_loss_layer_027": 0.054443, "value_mse_loss_layer_028": 0.056396, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.078125, "value_mse_loss_layer_031": 0.080566, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 9.7e-05, "vq_loss_layer_005": 0.000107, "vq_loss_layer_006": 0.000139, "vq_loss_layer_007": 0.000299, "vq_loss_layer_008": 0.000219, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.00028, "vq_loss_layer_011": 0.000273, "vq_loss_layer_012": 0.000433, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000511, "vq_loss_layer_015": 0.0005, "vq_loss_layer_016": 0.000469, "vq_loss_layer_017": 0.000391, "vq_loss_layer_018": 0.000218, "vq_loss_layer_019": 0.00019, "vq_loss_layer_020": 0.000214, "vq_loss_layer_021": 0.000507, "vq_loss_layer_022": 0.000288, "vq_loss_layer_023": 0.000366, "vq_loss_layer_024": 0.000322, "vq_loss_layer_025": 0.000347, "vq_loss_layer_026": 0.000717, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.001175, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.004883 }, { "ce_loss": 2.296533, "epoch": 0.0053, "grad_norm": 0.0041432068683207035, "key_mse_loss_layer_000": 0.002808, "key_mse_loss_layer_001": 0.009705, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.054837, "kv_vq_loss": 0.000578, "learning_rate": 0.0009310689674001971, "loss": 0.055389, "step": 5300, "value_mse_loss_layer_000": 0.000652, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007721, "value_mse_loss_layer_003": 0.013611, "value_mse_loss_layer_004": 0.011902, "value_mse_loss_layer_005": 0.011169, "value_mse_loss_layer_006": 0.013, "value_mse_loss_layer_007": 0.014343, "value_mse_loss_layer_008": 0.016846, "value_mse_loss_layer_009": 0.022339, "value_mse_loss_layer_010": 0.019287, "value_mse_loss_layer_011": 0.019287, "value_mse_loss_layer_012": 0.021484, "value_mse_loss_layer_013": 0.021973, "value_mse_loss_layer_014": 0.022339, "value_mse_loss_layer_015": 0.024902, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.025146, "value_mse_loss_layer_018": 0.021362, "value_mse_loss_layer_019": 0.025879, "value_mse_loss_layer_020": 0.028198, "value_mse_loss_layer_021": 0.033447, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.033691, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.047119, "value_mse_loss_layer_026": 0.043213, "value_mse_loss_layer_027": 0.048828, "value_mse_loss_layer_028": 0.056885, "value_mse_loss_layer_029": 0.080078, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.078613, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.00014, "vq_loss_layer_007": 0.000215, "vq_loss_layer_008": 0.000221, "vq_loss_layer_009": 0.000299, "vq_loss_layer_010": 0.000269, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.000572, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000458, "vq_loss_layer_015": 0.000473, "vq_loss_layer_016": 0.000414, "vq_loss_layer_017": 0.000376, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000263, "vq_loss_layer_021": 0.0005, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.000313, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.00041, "vq_loss_layer_026": 0.00066, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.001045, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.005402 }, { "ce_loss": 2.295273, "epoch": 0.00531, "grad_norm": 0.003015313996002078, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.054541, "kv_vq_loss": 0.000561, "learning_rate": 0.0009312736302703671, "loss": 0.05509, "step": 5310, "value_mse_loss_layer_000": 0.000671, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007721, "value_mse_loss_layer_003": 0.011841, "value_mse_loss_layer_004": 0.010986, "value_mse_loss_layer_005": 0.01062, "value_mse_loss_layer_006": 0.014038, "value_mse_loss_layer_007": 0.014099, "value_mse_loss_layer_008": 0.017456, "value_mse_loss_layer_009": 0.022949, "value_mse_loss_layer_010": 0.020508, "value_mse_loss_layer_011": 0.020752, "value_mse_loss_layer_012": 0.021851, "value_mse_loss_layer_013": 0.024292, "value_mse_loss_layer_014": 0.023682, "value_mse_loss_layer_015": 0.027466, "value_mse_loss_layer_016": 0.022339, "value_mse_loss_layer_017": 0.027222, "value_mse_loss_layer_018": 0.022461, "value_mse_loss_layer_019": 0.028931, "value_mse_loss_layer_020": 0.029175, "value_mse_loss_layer_021": 0.036133, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.049805, "value_mse_loss_layer_026": 0.043457, "value_mse_loss_layer_027": 0.049072, "value_mse_loss_layer_028": 0.054443, "value_mse_loss_layer_029": 0.075684, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.078125, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000176, "vq_loss_layer_007": 0.000208, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000269, "vq_loss_layer_010": 0.000242, "vq_loss_layer_011": 0.000265, "vq_loss_layer_012": 0.000486, "vq_loss_layer_013": 0.000416, "vq_loss_layer_014": 0.000433, "vq_loss_layer_015": 0.00058, "vq_loss_layer_016": 0.000427, "vq_loss_layer_017": 0.000448, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000241, "vq_loss_layer_021": 0.000492, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000334, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000469, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.004242 }, { "ce_loss": 2.296269, "epoch": 0.00532, "grad_norm": 0.003836331656202674, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.054379, "kv_vq_loss": 0.000565, "learning_rate": 0.0009314779080737619, "loss": 0.05491, "step": 5320, "value_mse_loss_layer_000": 0.000652, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.007599, "value_mse_loss_layer_003": 0.012024, "value_mse_loss_layer_004": 0.011353, "value_mse_loss_layer_005": 0.013855, "value_mse_loss_layer_006": 0.013611, "value_mse_loss_layer_007": 0.014587, "value_mse_loss_layer_008": 0.017334, "value_mse_loss_layer_009": 0.022095, "value_mse_loss_layer_010": 0.018555, "value_mse_loss_layer_011": 0.019409, "value_mse_loss_layer_012": 0.020508, "value_mse_loss_layer_013": 0.020996, "value_mse_loss_layer_014": 0.024292, "value_mse_loss_layer_015": 0.024658, "value_mse_loss_layer_016": 0.021973, "value_mse_loss_layer_017": 0.02478, "value_mse_loss_layer_018": 0.022949, "value_mse_loss_layer_019": 0.0271, "value_mse_loss_layer_020": 0.029785, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.033691, "value_mse_loss_layer_023": 0.038086, "value_mse_loss_layer_024": 0.039795, "value_mse_loss_layer_025": 0.048096, "value_mse_loss_layer_026": 0.044189, "value_mse_loss_layer_027": 0.056396, "value_mse_loss_layer_028": 0.058594, "value_mse_loss_layer_029": 0.09082, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.081543, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 0.000157, "vq_loss_layer_006": 0.000169, "vq_loss_layer_007": 0.000256, "vq_loss_layer_008": 0.000235, "vq_loss_layer_009": 0.00029, "vq_loss_layer_010": 0.000224, "vq_loss_layer_011": 0.000267, "vq_loss_layer_012": 0.000454, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000456, "vq_loss_layer_015": 0.000469, "vq_loss_layer_016": 0.000454, "vq_loss_layer_017": 0.000406, "vq_loss_layer_018": 0.000234, "vq_loss_layer_019": 0.000208, "vq_loss_layer_020": 0.000241, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.00028, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.000553, "vq_loss_layer_027": 0.000645, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.004822 }, { "ce_loss": 2.312104, "epoch": 0.00533, "grad_norm": 0.002773083047941327, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.054532, "kv_vq_loss": 0.000558, "learning_rate": 0.0009316818022566429, "loss": 0.05506, "step": 5330, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007935, "value_mse_loss_layer_003": 0.012817, "value_mse_loss_layer_004": 0.011414, "value_mse_loss_layer_005": 0.011597, "value_mse_loss_layer_006": 0.012878, "value_mse_loss_layer_007": 0.014099, "value_mse_loss_layer_008": 0.016968, "value_mse_loss_layer_009": 0.021484, "value_mse_loss_layer_010": 0.018433, "value_mse_loss_layer_011": 0.019043, "value_mse_loss_layer_012": 0.019653, "value_mse_loss_layer_013": 0.022583, "value_mse_loss_layer_014": 0.022705, "value_mse_loss_layer_015": 0.025024, "value_mse_loss_layer_016": 0.021118, "value_mse_loss_layer_017": 0.023682, "value_mse_loss_layer_018": 0.02417, "value_mse_loss_layer_019": 0.025269, "value_mse_loss_layer_020": 0.026733, "value_mse_loss_layer_021": 0.031494, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.040527, "value_mse_loss_layer_025": 0.044678, "value_mse_loss_layer_026": 0.039551, "value_mse_loss_layer_027": 0.051514, "value_mse_loss_layer_028": 0.054932, "value_mse_loss_layer_029": 0.07666, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.076172, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 8.7e-05, "vq_loss_layer_005": 0.000101, "vq_loss_layer_006": 0.000136, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000259, "vq_loss_layer_010": 0.000222, "vq_loss_layer_011": 0.00024, "vq_loss_layer_012": 0.000408, "vq_loss_layer_013": 0.000387, "vq_loss_layer_014": 0.000429, "vq_loss_layer_015": 0.000479, "vq_loss_layer_016": 0.000416, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000226, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000381, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000324, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002441, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.307382, "epoch": 0.00534, "grad_norm": 0.0040662470273673534, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.054456, "kv_vq_loss": 0.000571, "learning_rate": 0.0009318853142571391, "loss": 0.055002, "step": 5340, "value_mse_loss_layer_000": 0.000717, "value_mse_loss_layer_001": 0.001999, "value_mse_loss_layer_002": 0.00766, "value_mse_loss_layer_003": 0.012085, "value_mse_loss_layer_004": 0.011047, "value_mse_loss_layer_005": 0.011292, "value_mse_loss_layer_006": 0.013245, "value_mse_loss_layer_007": 0.013672, "value_mse_loss_layer_008": 0.016846, "value_mse_loss_layer_009": 0.021973, "value_mse_loss_layer_010": 0.019409, "value_mse_loss_layer_011": 0.019531, "value_mse_loss_layer_012": 0.020508, "value_mse_loss_layer_013": 0.021362, "value_mse_loss_layer_014": 0.023438, "value_mse_loss_layer_015": 0.024658, "value_mse_loss_layer_016": 0.020386, "value_mse_loss_layer_017": 0.025391, "value_mse_loss_layer_018": 0.022827, "value_mse_loss_layer_019": 0.0271, "value_mse_loss_layer_020": 0.028442, "value_mse_loss_layer_021": 0.035889, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.039062, "value_mse_loss_layer_024": 0.040771, "value_mse_loss_layer_025": 0.050293, "value_mse_loss_layer_026": 0.043457, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.060547, "value_mse_loss_layer_029": 0.081543, "value_mse_loss_layer_030": 0.073242, "value_mse_loss_layer_031": 0.084961, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000156, "vq_loss_layer_007": 0.00019, "vq_loss_layer_008": 0.000205, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.000234, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.000448, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000492, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000391, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000223, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.00053, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000351, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000538, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001045, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.005371 }, { "ce_loss": 2.345309, "epoch": 0.00535, "grad_norm": 0.0034219545777887106, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.054349, "kv_vq_loss": 0.000555, "learning_rate": 0.000932088445505307, "loss": 0.054886, "step": 5350, "value_mse_loss_layer_000": 0.000698, "value_mse_loss_layer_001": 0.002014, "value_mse_loss_layer_002": 0.007538, "value_mse_loss_layer_003": 0.013123, "value_mse_loss_layer_004": 0.010559, "value_mse_loss_layer_005": 0.010681, "value_mse_loss_layer_006": 0.013123, "value_mse_loss_layer_007": 0.015137, "value_mse_loss_layer_008": 0.017822, "value_mse_loss_layer_009": 0.022095, "value_mse_loss_layer_010": 0.018555, "value_mse_loss_layer_011": 0.019409, "value_mse_loss_layer_012": 0.02002, "value_mse_loss_layer_013": 0.021729, "value_mse_loss_layer_014": 0.022583, "value_mse_loss_layer_015": 0.026489, "value_mse_loss_layer_016": 0.024048, "value_mse_loss_layer_017": 0.026001, "value_mse_loss_layer_018": 0.02356, "value_mse_loss_layer_019": 0.026978, "value_mse_loss_layer_020": 0.02771, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.037842, "value_mse_loss_layer_025": 0.051025, "value_mse_loss_layer_026": 0.046875, "value_mse_loss_layer_027": 0.052246, "value_mse_loss_layer_028": 0.056885, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.072754, "value_mse_loss_layer_031": 0.079102, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.000144, "vq_loss_layer_007": 0.000261, "vq_loss_layer_008": 0.000238, "vq_loss_layer_009": 0.00028, "vq_loss_layer_010": 0.000216, "vq_loss_layer_011": 0.000242, "vq_loss_layer_012": 0.000422, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000414, "vq_loss_layer_015": 0.0005, "vq_loss_layer_016": 0.000488, "vq_loss_layer_017": 0.000427, "vq_loss_layer_018": 0.000243, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000372, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000326, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000652, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.004486 }, { "ce_loss": 2.31273, "epoch": 0.00536, "grad_norm": 0.0032528163865208626, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.054504, "kv_vq_loss": 0.000561, "learning_rate": 0.0009322911974231924, "loss": 0.055042, "step": 5360, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.001938, "value_mse_loss_layer_002": 0.007996, "value_mse_loss_layer_003": 0.012207, "value_mse_loss_layer_004": 0.012634, "value_mse_loss_layer_005": 0.010803, "value_mse_loss_layer_006": 0.013062, "value_mse_loss_layer_007": 0.013855, "value_mse_loss_layer_008": 0.016968, "value_mse_loss_layer_009": 0.020996, "value_mse_loss_layer_010": 0.019531, "value_mse_loss_layer_011": 0.018921, "value_mse_loss_layer_012": 0.020142, "value_mse_loss_layer_013": 0.02124, "value_mse_loss_layer_014": 0.024536, "value_mse_loss_layer_015": 0.024414, "value_mse_loss_layer_016": 0.020264, "value_mse_loss_layer_017": 0.024658, "value_mse_loss_layer_018": 0.022339, "value_mse_loss_layer_019": 0.026123, "value_mse_loss_layer_020": 0.02832, "value_mse_loss_layer_021": 0.035156, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.040527, "value_mse_loss_layer_025": 0.047852, "value_mse_loss_layer_026": 0.046631, "value_mse_loss_layer_027": 0.056396, "value_mse_loss_layer_028": 0.061035, "value_mse_loss_layer_029": 0.083496, "value_mse_loss_layer_030": 0.084961, "value_mse_loss_layer_031": 0.084473, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 0.000128, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000141, "vq_loss_layer_007": 0.000197, "vq_loss_layer_008": 0.000221, "vq_loss_layer_009": 0.000243, "vq_loss_layer_010": 0.000241, "vq_loss_layer_011": 0.000269, "vq_loss_layer_012": 0.000477, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.000465, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000238, "vq_loss_layer_021": 0.000437, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.00037, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.000614, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.001022, "vq_loss_layer_029": 0.001251, "vq_loss_layer_030": 0.002777, "vq_loss_layer_031": 0.005157 }, { "ce_loss": 2.283975, "epoch": 0.00537, "grad_norm": 0.0029829475097358227, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.054886, "kv_vq_loss": 0.000562, "learning_rate": 0.0009324935714248888, "loss": 0.05542, "step": 5370, "value_mse_loss_layer_000": 0.000687, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.008179, "value_mse_loss_layer_003": 0.011841, "value_mse_loss_layer_004": 0.011475, "value_mse_loss_layer_005": 0.011108, "value_mse_loss_layer_006": 0.013855, "value_mse_loss_layer_007": 0.013855, "value_mse_loss_layer_008": 0.016846, "value_mse_loss_layer_009": 0.021973, "value_mse_loss_layer_010": 0.019043, "value_mse_loss_layer_011": 0.02002, "value_mse_loss_layer_012": 0.019409, "value_mse_loss_layer_013": 0.021362, "value_mse_loss_layer_014": 0.022583, "value_mse_loss_layer_015": 0.024658, "value_mse_loss_layer_016": 0.019897, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.022217, "value_mse_loss_layer_019": 0.025635, "value_mse_loss_layer_020": 0.027344, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.031982, "value_mse_loss_layer_023": 0.0354, "value_mse_loss_layer_024": 0.041748, "value_mse_loss_layer_025": 0.049072, "value_mse_loss_layer_026": 0.039307, "value_mse_loss_layer_027": 0.050293, "value_mse_loss_layer_028": 0.057617, "value_mse_loss_layer_029": 0.07666, "value_mse_loss_layer_030": 0.072754, "value_mse_loss_layer_031": 0.07959, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 9.1e-05, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000191, "vq_loss_layer_007": 0.00021, "vq_loss_layer_008": 0.000221, "vq_loss_layer_009": 0.000296, "vq_loss_layer_010": 0.000267, "vq_loss_layer_011": 0.000273, "vq_loss_layer_012": 0.000437, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.000465, "vq_loss_layer_015": 0.000456, "vq_loss_layer_016": 0.000412, "vq_loss_layer_017": 0.000387, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000185, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.000456, "vq_loss_layer_022": 0.000292, "vq_loss_layer_023": 0.00036, "vq_loss_layer_024": 0.000347, "vq_loss_layer_025": 0.000399, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.001083, "vq_loss_layer_029": 0.001099, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.00473 }, { "ce_loss": 2.269543, "epoch": 0.00538, "grad_norm": 0.004316998645663261, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.054315, "kv_vq_loss": 0.00056, "learning_rate": 0.0009326955689165972, "loss": 0.054855, "step": 5380, "value_mse_loss_layer_000": 0.000648, "value_mse_loss_layer_001": 0.002014, "value_mse_loss_layer_002": 0.007935, "value_mse_loss_layer_003": 0.011902, "value_mse_loss_layer_004": 0.011108, "value_mse_loss_layer_005": 0.01123, "value_mse_loss_layer_006": 0.012939, "value_mse_loss_layer_007": 0.013794, "value_mse_loss_layer_008": 0.016968, "value_mse_loss_layer_009": 0.02124, "value_mse_loss_layer_010": 0.0177, "value_mse_loss_layer_011": 0.019409, "value_mse_loss_layer_012": 0.022339, "value_mse_loss_layer_013": 0.020874, "value_mse_loss_layer_014": 0.021851, "value_mse_loss_layer_015": 0.02478, "value_mse_loss_layer_016": 0.021118, "value_mse_loss_layer_017": 0.024292, "value_mse_loss_layer_018": 0.023315, "value_mse_loss_layer_019": 0.026367, "value_mse_loss_layer_020": 0.029785, "value_mse_loss_layer_021": 0.035645, "value_mse_loss_layer_022": 0.031982, "value_mse_loss_layer_023": 0.036377, "value_mse_loss_layer_024": 0.041748, "value_mse_loss_layer_025": 0.051514, "value_mse_loss_layer_026": 0.040527, "value_mse_loss_layer_027": 0.061279, "value_mse_loss_layer_028": 0.060059, "value_mse_loss_layer_029": 0.085449, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.08252, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 9.9e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000193, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000206, "vq_loss_layer_011": 0.000248, "vq_loss_layer_012": 0.000568, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000423, "vq_loss_layer_017": 0.000391, "vq_loss_layer_018": 0.000271, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000236, "vq_loss_layer_021": 0.000406, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000307, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000412, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000782, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001251, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.004822 }, { "ce_loss": 2.292379, "epoch": 0.00539, "grad_norm": 0.003295763162896037, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.054312, "kv_vq_loss": 0.000561, "learning_rate": 0.0009328971912966845, "loss": 0.05484, "step": 5390, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.007477, "value_mse_loss_layer_003": 0.012024, "value_mse_loss_layer_004": 0.011169, "value_mse_loss_layer_005": 0.010559, "value_mse_loss_layer_006": 0.012756, "value_mse_loss_layer_007": 0.013672, "value_mse_loss_layer_008": 0.017456, "value_mse_loss_layer_009": 0.022705, "value_mse_loss_layer_010": 0.019531, "value_mse_loss_layer_011": 0.019531, "value_mse_loss_layer_012": 0.020752, "value_mse_loss_layer_013": 0.021729, "value_mse_loss_layer_014": 0.024048, "value_mse_loss_layer_015": 0.025513, "value_mse_loss_layer_016": 0.023804, "value_mse_loss_layer_017": 0.025024, "value_mse_loss_layer_018": 0.023682, "value_mse_loss_layer_019": 0.026245, "value_mse_loss_layer_020": 0.027832, "value_mse_loss_layer_021": 0.033203, "value_mse_loss_layer_022": 0.036133, "value_mse_loss_layer_023": 0.037109, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.047852, "value_mse_loss_layer_026": 0.043701, "value_mse_loss_layer_027": 0.053467, "value_mse_loss_layer_028": 0.056641, "value_mse_loss_layer_029": 0.077637, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.080078, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000188, "vq_loss_layer_008": 0.000208, "vq_loss_layer_009": 0.000292, "vq_loss_layer_010": 0.000236, "vq_loss_layer_011": 0.000267, "vq_loss_layer_012": 0.000433, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000486, "vq_loss_layer_015": 0.000446, "vq_loss_layer_016": 0.000511, "vq_loss_layer_017": 0.00038, "vq_loss_layer_018": 0.000236, "vq_loss_layer_019": 0.000201, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000443, "vq_loss_layer_022": 0.000341, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.000309, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.0019, "vq_loss_layer_031": 0.004517 }, { "ce_loss": 2.290136, "epoch": 0.0054, "grad_norm": 0.0030509671196341515, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.054456, "kv_vq_loss": 0.00056, "learning_rate": 0.000933098439955742, "loss": 0.054993, "step": 5400, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.001999, "value_mse_loss_layer_002": 0.007507, "value_mse_loss_layer_003": 0.011719, "value_mse_loss_layer_004": 0.01123, "value_mse_loss_layer_005": 0.011169, "value_mse_loss_layer_006": 0.012817, "value_mse_loss_layer_007": 0.015198, "value_mse_loss_layer_008": 0.016968, "value_mse_loss_layer_009": 0.022339, "value_mse_loss_layer_010": 0.019165, "value_mse_loss_layer_011": 0.019897, "value_mse_loss_layer_012": 0.020508, "value_mse_loss_layer_013": 0.022827, "value_mse_loss_layer_014": 0.022705, "value_mse_loss_layer_015": 0.025269, "value_mse_loss_layer_016": 0.022705, "value_mse_loss_layer_017": 0.025146, "value_mse_loss_layer_018": 0.021973, "value_mse_loss_layer_019": 0.025513, "value_mse_loss_layer_020": 0.028564, "value_mse_loss_layer_021": 0.031494, "value_mse_loss_layer_022": 0.031982, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.043945, "value_mse_loss_layer_026": 0.040771, "value_mse_loss_layer_027": 0.048584, "value_mse_loss_layer_028": 0.053955, "value_mse_loss_layer_029": 0.077637, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.074707, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000129, "vq_loss_layer_007": 0.000259, "vq_loss_layer_008": 0.000205, "vq_loss_layer_009": 0.000292, "vq_loss_layer_010": 0.000252, "vq_loss_layer_011": 0.000284, "vq_loss_layer_012": 0.000412, "vq_loss_layer_013": 0.000387, "vq_loss_layer_014": 0.000429, "vq_loss_layer_015": 0.000444, "vq_loss_layer_016": 0.000492, "vq_loss_layer_017": 0.000385, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.000208, "vq_loss_layer_020": 0.000231, "vq_loss_layer_021": 0.000425, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.000315, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000338, "vq_loss_layer_026": 0.000534, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.001045, "vq_loss_layer_030": 0.001846, "vq_loss_layer_031": 0.004425 }, { "ce_loss": 2.310896, "epoch": 0.00541, "grad_norm": 0.0035016408655792475, "key_mse_loss_layer_000": 0.004059, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.054355, "kv_vq_loss": 0.000559, "learning_rate": 0.0009332993162766422, "loss": 0.054889, "step": 5410, "value_mse_loss_layer_000": 0.000668, "value_mse_loss_layer_001": 0.002045, "value_mse_loss_layer_002": 0.008057, "value_mse_loss_layer_003": 0.012939, "value_mse_loss_layer_004": 0.012695, "value_mse_loss_layer_005": 0.011414, "value_mse_loss_layer_006": 0.013245, "value_mse_loss_layer_007": 0.014038, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.021118, "value_mse_loss_layer_010": 0.017578, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.019775, "value_mse_loss_layer_013": 0.020508, "value_mse_loss_layer_014": 0.023193, "value_mse_loss_layer_015": 0.025757, "value_mse_loss_layer_016": 0.02124, "value_mse_loss_layer_017": 0.023682, "value_mse_loss_layer_018": 0.025024, "value_mse_loss_layer_019": 0.025513, "value_mse_loss_layer_020": 0.02771, "value_mse_loss_layer_021": 0.033936, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.037354, "value_mse_loss_layer_024": 0.048096, "value_mse_loss_layer_025": 0.046143, "value_mse_loss_layer_026": 0.042236, "value_mse_loss_layer_027": 0.056152, "value_mse_loss_layer_028": 0.056396, "value_mse_loss_layer_029": 0.084473, "value_mse_loss_layer_030": 0.076172, "value_mse_loss_layer_031": 0.084473, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 0.000105, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000143, "vq_loss_layer_007": 0.000208, "vq_loss_layer_008": 0.00023, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000206, "vq_loss_layer_011": 0.000256, "vq_loss_layer_012": 0.000452, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.00045, "vq_loss_layer_015": 0.000492, "vq_loss_layer_016": 0.000452, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.000303, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000223, "vq_loss_layer_021": 0.000437, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.000402, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000843, "vq_loss_layer_029": 0.001266, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.005585 }, { "ce_loss": 2.310596, "epoch": 0.00542, "grad_norm": 0.0029512273613363504, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.054266, "kv_vq_loss": 0.000557, "learning_rate": 0.0009334998216345966, "loss": 0.054807, "step": 5420, "value_mse_loss_layer_000": 0.000648, "value_mse_loss_layer_001": 0.002014, "value_mse_loss_layer_002": 0.007874, "value_mse_loss_layer_003": 0.012451, "value_mse_loss_layer_004": 0.012207, "value_mse_loss_layer_005": 0.012573, "value_mse_loss_layer_006": 0.013062, "value_mse_loss_layer_007": 0.014038, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.022095, "value_mse_loss_layer_010": 0.018433, "value_mse_loss_layer_011": 0.018677, "value_mse_loss_layer_012": 0.019287, "value_mse_loss_layer_013": 0.021118, "value_mse_loss_layer_014": 0.021851, "value_mse_loss_layer_015": 0.02356, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.024536, "value_mse_loss_layer_018": 0.026001, "value_mse_loss_layer_019": 0.0271, "value_mse_loss_layer_020": 0.028931, "value_mse_loss_layer_021": 0.032227, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.040771, "value_mse_loss_layer_024": 0.041504, "value_mse_loss_layer_025": 0.050049, "value_mse_loss_layer_026": 0.042725, "value_mse_loss_layer_027": 0.058105, "value_mse_loss_layer_028": 0.059082, "value_mse_loss_layer_029": 0.07959, "value_mse_loss_layer_030": 0.078125, "value_mse_loss_layer_031": 0.08252, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 8.4e-05, "vq_loss_layer_005": 0.00012, "vq_loss_layer_006": 0.000144, "vq_loss_layer_007": 0.000189, "vq_loss_layer_008": 0.000233, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.000221, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.000404, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.000431, "vq_loss_layer_016": 0.000376, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000319, "vq_loss_layer_019": 0.000195, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000408, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000391, "vq_loss_layer_024": 0.000317, "vq_loss_layer_025": 0.000452, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000648, "vq_loss_layer_028": 0.00103, "vq_loss_layer_029": 0.00132, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.005493 }, { "ce_loss": 2.276206, "epoch": 0.00543, "grad_norm": 0.0034122131764888763, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.054257, "kv_vq_loss": 0.000565, "learning_rate": 0.0009336999573972116, "loss": 0.054791, "step": 5430, "value_mse_loss_layer_000": 0.000755, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007507, "value_mse_loss_layer_003": 0.012085, "value_mse_loss_layer_004": 0.010742, "value_mse_loss_layer_005": 0.010925, "value_mse_loss_layer_006": 0.01355, "value_mse_loss_layer_007": 0.013794, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.021729, "value_mse_loss_layer_010": 0.018677, "value_mse_loss_layer_011": 0.019287, "value_mse_loss_layer_012": 0.023926, "value_mse_loss_layer_013": 0.021484, "value_mse_loss_layer_014": 0.022583, "value_mse_loss_layer_015": 0.024902, "value_mse_loss_layer_016": 0.022583, "value_mse_loss_layer_017": 0.025024, "value_mse_loss_layer_018": 0.022827, "value_mse_loss_layer_019": 0.025391, "value_mse_loss_layer_020": 0.027466, "value_mse_loss_layer_021": 0.033936, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.034668, "value_mse_loss_layer_024": 0.03833, "value_mse_loss_layer_025": 0.049316, "value_mse_loss_layer_026": 0.041992, "value_mse_loss_layer_027": 0.051514, "value_mse_loss_layer_028": 0.05542, "value_mse_loss_layer_029": 0.078613, "value_mse_loss_layer_030": 0.083008, "value_mse_loss_layer_031": 0.076172, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.000167, "vq_loss_layer_007": 0.000197, "vq_loss_layer_008": 0.000206, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.000217, "vq_loss_layer_011": 0.000254, "vq_loss_layer_012": 0.000668, "vq_loss_layer_013": 0.000372, "vq_loss_layer_014": 0.000437, "vq_loss_layer_015": 0.000425, "vq_loss_layer_016": 0.000441, "vq_loss_layer_017": 0.000372, "vq_loss_layer_018": 0.000229, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000456, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.00029, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000351, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000828, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.004272 }, { "ce_loss": 2.329484, "epoch": 0.00544, "grad_norm": 0.0032587205059826374, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.044189, "key_mse_loss_layer_004": 0.04126, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.0625, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.121582, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.102051, "key_mse_loss_layer_017": 0.103027, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.060303, "kv_mse_loss": 0.054346, "kv_vq_loss": 0.000581, "learning_rate": 0.0009338997249245448, "loss": 0.054913, "step": 5440, "value_mse_loss_layer_000": 0.000648, "value_mse_loss_layer_001": 0.00193, "value_mse_loss_layer_002": 0.008911, "value_mse_loss_layer_003": 0.012207, "value_mse_loss_layer_004": 0.011475, "value_mse_loss_layer_005": 0.01123, "value_mse_loss_layer_006": 0.012817, "value_mse_loss_layer_007": 0.014893, "value_mse_loss_layer_008": 0.017456, "value_mse_loss_layer_009": 0.023071, "value_mse_loss_layer_010": 0.018066, "value_mse_loss_layer_011": 0.018799, "value_mse_loss_layer_012": 0.02063, "value_mse_loss_layer_013": 0.021484, "value_mse_loss_layer_014": 0.023071, "value_mse_loss_layer_015": 0.024048, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.024536, "value_mse_loss_layer_018": 0.02124, "value_mse_loss_layer_019": 0.025513, "value_mse_loss_layer_020": 0.026978, "value_mse_loss_layer_021": 0.033447, "value_mse_loss_layer_022": 0.029053, "value_mse_loss_layer_023": 0.033691, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.046387, "value_mse_loss_layer_026": 0.038574, "value_mse_loss_layer_027": 0.05127, "value_mse_loss_layer_028": 0.056152, "value_mse_loss_layer_029": 0.069824, "value_mse_loss_layer_030": 0.071289, "value_mse_loss_layer_031": 0.079102, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.9e-05, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000228, "vq_loss_layer_008": 0.00032, "vq_loss_layer_009": 0.000372, "vq_loss_layer_010": 0.000275, "vq_loss_layer_011": 0.000269, "vq_loss_layer_012": 0.000488, "vq_loss_layer_013": 0.000389, "vq_loss_layer_014": 0.000553, "vq_loss_layer_015": 0.000486, "vq_loss_layer_016": 0.000492, "vq_loss_layer_017": 0.000465, "vq_loss_layer_018": 0.000231, "vq_loss_layer_019": 0.000214, "vq_loss_layer_020": 0.000273, "vq_loss_layer_021": 0.000576, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.000446, "vq_loss_layer_024": 0.00037, "vq_loss_layer_025": 0.000786, "vq_loss_layer_026": 0.000664, "vq_loss_layer_027": 0.000767, "vq_loss_layer_028": 0.001343, "vq_loss_layer_029": 0.001251, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.006592 }, { "ce_loss": 2.345843, "epoch": 0.00545, "grad_norm": 0.0036472685169428587, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.053976, "kv_vq_loss": 0.00055, "learning_rate": 0.0009340991255691605, "loss": 0.054495, "step": 5450, "value_mse_loss_layer_000": 0.000687, "value_mse_loss_layer_001": 0.00209, "value_mse_loss_layer_002": 0.007935, "value_mse_loss_layer_003": 0.011902, "value_mse_loss_layer_004": 0.011292, "value_mse_loss_layer_005": 0.011292, "value_mse_loss_layer_006": 0.013489, "value_mse_loss_layer_007": 0.013977, "value_mse_loss_layer_008": 0.017212, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.020508, "value_mse_loss_layer_011": 0.019775, "value_mse_loss_layer_012": 0.020874, "value_mse_loss_layer_013": 0.022217, "value_mse_loss_layer_014": 0.022583, "value_mse_loss_layer_015": 0.025879, "value_mse_loss_layer_016": 0.021851, "value_mse_loss_layer_017": 0.028076, "value_mse_loss_layer_018": 0.022583, "value_mse_loss_layer_019": 0.026001, "value_mse_loss_layer_020": 0.02832, "value_mse_loss_layer_021": 0.032471, "value_mse_loss_layer_022": 0.031982, "value_mse_loss_layer_023": 0.035156, "value_mse_loss_layer_024": 0.037354, "value_mse_loss_layer_025": 0.048584, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.053711, "value_mse_loss_layer_028": 0.054932, "value_mse_loss_layer_029": 0.080566, "value_mse_loss_layer_030": 0.073242, "value_mse_loss_layer_031": 0.080078, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000166, "vq_loss_layer_007": 0.000194, "vq_loss_layer_008": 0.000211, "vq_loss_layer_009": 0.000294, "vq_loss_layer_010": 0.000277, "vq_loss_layer_011": 0.000269, "vq_loss_layer_012": 0.000435, "vq_loss_layer_013": 0.000391, "vq_loss_layer_014": 0.000429, "vq_loss_layer_015": 0.000467, "vq_loss_layer_016": 0.000452, "vq_loss_layer_017": 0.00061, "vq_loss_layer_018": 0.000236, "vq_loss_layer_019": 0.000189, "vq_loss_layer_020": 0.000242, "vq_loss_layer_021": 0.000444, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.000324, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.001289, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.005066 }, { "ce_loss": 2.294346, "epoch": 0.00546, "grad_norm": 0.002558897016569972, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.096191, "key_mse_loss_layer_010": 0.10791, "key_mse_loss_layer_011": 0.104492, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.126953, "key_mse_loss_layer_015": 0.116699, "key_mse_loss_layer_016": 0.111328, "key_mse_loss_layer_017": 0.109863, "key_mse_loss_layer_018": 0.118652, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.106445, "key_mse_loss_layer_021": 0.101074, "key_mse_loss_layer_022": 0.105957, "key_mse_loss_layer_023": 0.102051, "key_mse_loss_layer_024": 0.081055, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.091797, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.054205, "kv_vq_loss": 0.000552, "learning_rate": 0.0009342981606761842, "loss": 0.054733, "step": 5460, "value_mse_loss_layer_000": 0.000656, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007874, "value_mse_loss_layer_003": 0.012268, "value_mse_loss_layer_004": 0.011719, "value_mse_loss_layer_005": 0.011963, "value_mse_loss_layer_006": 0.013062, "value_mse_loss_layer_007": 0.015076, "value_mse_loss_layer_008": 0.016602, "value_mse_loss_layer_009": 0.020996, "value_mse_loss_layer_010": 0.018066, "value_mse_loss_layer_011": 0.019043, "value_mse_loss_layer_012": 0.019653, "value_mse_loss_layer_013": 0.020996, "value_mse_loss_layer_014": 0.022095, "value_mse_loss_layer_015": 0.02356, "value_mse_loss_layer_016": 0.019775, "value_mse_loss_layer_017": 0.023438, "value_mse_loss_layer_018": 0.021973, "value_mse_loss_layer_019": 0.02417, "value_mse_loss_layer_020": 0.026001, "value_mse_loss_layer_021": 0.034912, "value_mse_loss_layer_022": 0.029541, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.069336, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.077637, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 0.000134, "vq_loss_layer_006": 0.000165, "vq_loss_layer_007": 0.000265, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000288, "vq_loss_layer_010": 0.00028, "vq_loss_layer_011": 0.000278, "vq_loss_layer_012": 0.000437, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000425, "vq_loss_layer_016": 0.000469, "vq_loss_layer_017": 0.000341, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000671, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.000324, "vq_loss_layer_024": 0.000389, "vq_loss_layer_025": 0.000534, "vq_loss_layer_026": 0.000603, "vq_loss_layer_027": 0.000656, "vq_loss_layer_028": 0.001045, "vq_loss_layer_029": 0.001259, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.006226 }, { "ce_loss": 2.303456, "epoch": 0.00547, "grad_norm": 0.0034213149920105934, "key_mse_loss_layer_000": 0.003876, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.106445, "key_mse_loss_layer_014": 0.103516, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.091309, "key_mse_loss_layer_028": 0.094238, "key_mse_loss_layer_029": 0.09375, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.054187, "kv_vq_loss": 0.000544, "learning_rate": 0.0009344968315833577, "loss": 0.054703, "step": 5470, "value_mse_loss_layer_000": 0.00066, "value_mse_loss_layer_001": 0.002014, "value_mse_loss_layer_002": 0.007568, "value_mse_loss_layer_003": 0.013672, "value_mse_loss_layer_004": 0.01178, "value_mse_loss_layer_005": 0.011047, "value_mse_loss_layer_006": 0.013367, "value_mse_loss_layer_007": 0.014221, "value_mse_loss_layer_008": 0.0177, "value_mse_loss_layer_009": 0.021973, "value_mse_loss_layer_010": 0.017944, "value_mse_loss_layer_011": 0.019409, "value_mse_loss_layer_012": 0.021118, "value_mse_loss_layer_013": 0.023804, "value_mse_loss_layer_014": 0.022949, "value_mse_loss_layer_015": 0.025269, "value_mse_loss_layer_016": 0.021606, "value_mse_loss_layer_017": 0.025269, "value_mse_loss_layer_018": 0.024902, "value_mse_loss_layer_019": 0.02771, "value_mse_loss_layer_020": 0.031006, "value_mse_loss_layer_021": 0.033936, "value_mse_loss_layer_022": 0.034912, "value_mse_loss_layer_023": 0.049805, "value_mse_loss_layer_024": 0.052002, "value_mse_loss_layer_025": 0.053711, "value_mse_loss_layer_026": 0.050293, "value_mse_loss_layer_027": 0.063965, "value_mse_loss_layer_028": 0.072266, "value_mse_loss_layer_029": 0.095215, "value_mse_loss_layer_030": 0.093262, "value_mse_loss_layer_031": 0.097656, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000152, "vq_loss_layer_007": 0.000214, "vq_loss_layer_008": 0.000212, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000261, "vq_loss_layer_012": 0.000458, "vq_loss_layer_013": 0.000422, "vq_loss_layer_014": 0.000385, "vq_loss_layer_015": 0.000427, "vq_loss_layer_016": 0.000381, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000273, "vq_loss_layer_019": 0.00021, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000322, "vq_loss_layer_024": 0.000362, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.001045, "vq_loss_layer_029": 0.00145, "vq_loss_layer_030": 0.002884, "vq_loss_layer_031": 0.006104 }, { "ce_loss": 2.315917, "epoch": 0.00548, "grad_norm": 0.003305102465674281, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.060059, "key_mse_loss_layer_005": 0.067383, "key_mse_loss_layer_006": 0.072266, "key_mse_loss_layer_007": 0.080566, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.054147, "kv_vq_loss": 0.000561, "learning_rate": 0.0009346951396210922, "loss": 0.054675, "step": 5480, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007935, "value_mse_loss_layer_003": 0.011963, "value_mse_loss_layer_004": 0.01062, "value_mse_loss_layer_005": 0.01123, "value_mse_loss_layer_006": 0.013, "value_mse_loss_layer_007": 0.014099, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.022461, "value_mse_loss_layer_010": 0.019287, "value_mse_loss_layer_011": 0.02063, "value_mse_loss_layer_012": 0.024536, "value_mse_loss_layer_013": 0.022827, "value_mse_loss_layer_014": 0.023315, "value_mse_loss_layer_015": 0.025146, "value_mse_loss_layer_016": 0.021484, "value_mse_loss_layer_017": 0.024902, "value_mse_loss_layer_018": 0.022705, "value_mse_loss_layer_019": 0.030273, "value_mse_loss_layer_020": 0.028076, "value_mse_loss_layer_021": 0.031128, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.040527, "value_mse_loss_layer_024": 0.045898, "value_mse_loss_layer_025": 0.048096, "value_mse_loss_layer_026": 0.041016, "value_mse_loss_layer_027": 0.052734, "value_mse_loss_layer_028": 0.054688, "value_mse_loss_layer_029": 0.075195, "value_mse_loss_layer_030": 0.07373, "value_mse_loss_layer_031": 0.078613, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 0.000113, "vq_loss_layer_006": 0.000147, "vq_loss_layer_007": 0.000214, "vq_loss_layer_008": 0.00022, "vq_loss_layer_009": 0.000299, "vq_loss_layer_010": 0.000265, "vq_loss_layer_011": 0.000298, "vq_loss_layer_012": 0.000702, "vq_loss_layer_013": 0.000423, "vq_loss_layer_014": 0.000488, "vq_loss_layer_015": 0.000515, "vq_loss_layer_016": 0.000441, "vq_loss_layer_017": 0.00037, "vq_loss_layer_018": 0.000235, "vq_loss_layer_019": 0.000212, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000271, "vq_loss_layer_023": 0.00038, "vq_loss_layer_024": 0.00038, "vq_loss_layer_025": 0.000374, "vq_loss_layer_026": 0.000568, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.001213, "vq_loss_layer_030": 0.002762, "vq_loss_layer_031": 0.004608 }, { "ce_loss": 2.322964, "epoch": 0.00549, "grad_norm": 0.004032881930470467, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.054373, "kv_vq_loss": 0.000559, "learning_rate": 0.0009348930861125227, "loss": 0.05491, "step": 5490, "value_mse_loss_layer_000": 0.00069, "value_mse_loss_layer_001": 0.001999, "value_mse_loss_layer_002": 0.008118, "value_mse_loss_layer_003": 0.012329, "value_mse_loss_layer_004": 0.011353, "value_mse_loss_layer_005": 0.011353, "value_mse_loss_layer_006": 0.013367, "value_mse_loss_layer_007": 0.014343, "value_mse_loss_layer_008": 0.017578, "value_mse_loss_layer_009": 0.022095, "value_mse_loss_layer_010": 0.019287, "value_mse_loss_layer_011": 0.019531, "value_mse_loss_layer_012": 0.021851, "value_mse_loss_layer_013": 0.022095, "value_mse_loss_layer_014": 0.024414, "value_mse_loss_layer_015": 0.025391, "value_mse_loss_layer_016": 0.02478, "value_mse_loss_layer_017": 0.025513, "value_mse_loss_layer_018": 0.02356, "value_mse_loss_layer_019": 0.026367, "value_mse_loss_layer_020": 0.030273, "value_mse_loss_layer_021": 0.034424, "value_mse_loss_layer_022": 0.031982, "value_mse_loss_layer_023": 0.036865, "value_mse_loss_layer_024": 0.042725, "value_mse_loss_layer_025": 0.05249, "value_mse_loss_layer_026": 0.042725, "value_mse_loss_layer_027": 0.054932, "value_mse_loss_layer_028": 0.057373, "value_mse_loss_layer_029": 0.087891, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.086426, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 2.4e-05, "vq_loss_layer_003": 3.8e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000167, "vq_loss_layer_007": 0.000215, "vq_loss_layer_008": 0.000261, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.000307, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000475, "vq_loss_layer_013": 0.000378, "vq_loss_layer_014": 0.000553, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.000599, "vq_loss_layer_017": 0.000437, "vq_loss_layer_018": 0.00032, "vq_loss_layer_019": 0.000235, "vq_loss_layer_020": 0.000301, "vq_loss_layer_021": 0.000507, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.000355, "vq_loss_layer_024": 0.000399, "vq_loss_layer_025": 0.000587, "vq_loss_layer_026": 0.000721, "vq_loss_layer_027": 0.000782, "vq_loss_layer_028": 0.00119, "vq_loss_layer_029": 0.002029, "vq_loss_layer_030": 0.002899, "vq_loss_layer_031": 0.006592 }, { "ce_loss": 2.300697, "epoch": 0.0055, "grad_norm": 0.0030092569068074226, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.054022, "kv_vq_loss": 0.000562, "learning_rate": 0.0009350906723735609, "loss": 0.054568, "step": 5500, "value_mse_loss_layer_000": 0.00066, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007568, "value_mse_loss_layer_003": 0.011902, "value_mse_loss_layer_004": 0.012817, "value_mse_loss_layer_005": 0.010864, "value_mse_loss_layer_006": 0.01355, "value_mse_loss_layer_007": 0.013733, "value_mse_loss_layer_008": 0.016846, "value_mse_loss_layer_009": 0.021606, "value_mse_loss_layer_010": 0.018921, "value_mse_loss_layer_011": 0.02063, "value_mse_loss_layer_012": 0.020386, "value_mse_loss_layer_013": 0.021973, "value_mse_loss_layer_014": 0.022827, "value_mse_loss_layer_015": 0.025269, "value_mse_loss_layer_016": 0.021118, "value_mse_loss_layer_017": 0.025513, "value_mse_loss_layer_018": 0.022339, "value_mse_loss_layer_019": 0.025757, "value_mse_loss_layer_020": 0.027466, "value_mse_loss_layer_021": 0.03418, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.036377, "value_mse_loss_layer_024": 0.039307, "value_mse_loss_layer_025": 0.046387, "value_mse_loss_layer_026": 0.041016, "value_mse_loss_layer_027": 0.05249, "value_mse_loss_layer_028": 0.057617, "value_mse_loss_layer_029": 0.073242, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.077148, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 0.000138, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.000173, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.000222, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000244, "vq_loss_layer_011": 0.000303, "vq_loss_layer_012": 0.000429, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.000437, "vq_loss_layer_015": 0.000454, "vq_loss_layer_016": 0.000443, "vq_loss_layer_017": 0.000477, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000227, "vq_loss_layer_021": 0.000481, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000334, "vq_loss_layer_024": 0.000303, "vq_loss_layer_025": 0.000362, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000603, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.001038, "vq_loss_layer_030": 0.003616, "vq_loss_layer_031": 0.004791 }, { "ce_loss": 2.334715, "epoch": 0.00551, "grad_norm": 0.0028200028464198112, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.060059, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.054089, "kv_vq_loss": 0.000546, "learning_rate": 0.0009352878997129461, "loss": 0.054611, "step": 5510, "value_mse_loss_layer_000": 0.000633, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007751, "value_mse_loss_layer_003": 0.012634, "value_mse_loss_layer_004": 0.012085, "value_mse_loss_layer_005": 0.011414, "value_mse_loss_layer_006": 0.012756, "value_mse_loss_layer_007": 0.014465, "value_mse_loss_layer_008": 0.016602, "value_mse_loss_layer_009": 0.021729, "value_mse_loss_layer_010": 0.017334, "value_mse_loss_layer_011": 0.019165, "value_mse_loss_layer_012": 0.019165, "value_mse_loss_layer_013": 0.019775, "value_mse_loss_layer_014": 0.020874, "value_mse_loss_layer_015": 0.021851, "value_mse_loss_layer_016": 0.019531, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.025635, "value_mse_loss_layer_019": 0.026489, "value_mse_loss_layer_020": 0.027222, "value_mse_loss_layer_021": 0.029663, "value_mse_loss_layer_022": 0.030518, "value_mse_loss_layer_023": 0.037598, "value_mse_loss_layer_024": 0.038574, "value_mse_loss_layer_025": 0.047119, "value_mse_loss_layer_026": 0.042725, "value_mse_loss_layer_027": 0.058594, "value_mse_loss_layer_028": 0.057129, "value_mse_loss_layer_029": 0.078613, "value_mse_loss_layer_030": 0.078125, "value_mse_loss_layer_031": 0.084961, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.2e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000135, "vq_loss_layer_007": 0.000214, "vq_loss_layer_008": 0.000237, "vq_loss_layer_009": 0.000313, "vq_loss_layer_010": 0.000232, "vq_loss_layer_011": 0.000267, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000401, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000406, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000427, "vq_loss_layer_019": 0.000254, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.00033, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.000422, "vq_loss_layer_026": 0.000599, "vq_loss_layer_027": 0.000824, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001503, "vq_loss_layer_030": 0.003555, "vq_loss_layer_031": 0.006775 }, { "ce_loss": 2.326879, "epoch": 0.00552, "grad_norm": 0.0032028653658926487, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.054224, "kv_vq_loss": 0.000557, "learning_rate": 0.0009354847694322997, "loss": 0.054767, "step": 5520, "value_mse_loss_layer_000": 0.000668, "value_mse_loss_layer_001": 0.002014, "value_mse_loss_layer_002": 0.007599, "value_mse_loss_layer_003": 0.013977, "value_mse_loss_layer_004": 0.011108, "value_mse_loss_layer_005": 0.011292, "value_mse_loss_layer_006": 0.012878, "value_mse_loss_layer_007": 0.013916, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.021729, "value_mse_loss_layer_010": 0.019043, "value_mse_loss_layer_011": 0.019043, "value_mse_loss_layer_012": 0.023804, "value_mse_loss_layer_013": 0.021729, "value_mse_loss_layer_014": 0.023926, "value_mse_loss_layer_015": 0.025757, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.026245, "value_mse_loss_layer_018": 0.022217, "value_mse_loss_layer_019": 0.025879, "value_mse_loss_layer_020": 0.02771, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.036865, "value_mse_loss_layer_024": 0.040771, "value_mse_loss_layer_025": 0.044922, "value_mse_loss_layer_026": 0.041016, "value_mse_loss_layer_027": 0.051025, "value_mse_loss_layer_028": 0.057617, "value_mse_loss_layer_029": 0.081543, "value_mse_loss_layer_030": 0.070801, "value_mse_loss_layer_031": 0.077637, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000135, "vq_loss_layer_007": 0.000194, "vq_loss_layer_008": 0.000231, "vq_loss_layer_009": 0.000261, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.00074, "vq_loss_layer_013": 0.00041, "vq_loss_layer_014": 0.000511, "vq_loss_layer_015": 0.000484, "vq_loss_layer_016": 0.000439, "vq_loss_layer_017": 0.000454, "vq_loss_layer_018": 0.000225, "vq_loss_layer_019": 0.000191, "vq_loss_layer_020": 0.000227, "vq_loss_layer_021": 0.000469, "vq_loss_layer_022": 0.000315, "vq_loss_layer_023": 0.000387, "vq_loss_layer_024": 0.000351, "vq_loss_layer_025": 0.000387, "vq_loss_layer_026": 0.000587, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.001022, "vq_loss_layer_029": 0.001282, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.005005 }, { "ce_loss": 2.287103, "epoch": 0.00553, "grad_norm": 0.003025386482477188, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.054193, "kv_vq_loss": 0.000561, "learning_rate": 0.0009356812828261745, "loss": 0.05473, "step": 5530, "value_mse_loss_layer_000": 0.00066, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007446, "value_mse_loss_layer_003": 0.011902, "value_mse_loss_layer_004": 0.011353, "value_mse_loss_layer_005": 0.010742, "value_mse_loss_layer_006": 0.012939, "value_mse_loss_layer_007": 0.013611, "value_mse_loss_layer_008": 0.016846, "value_mse_loss_layer_009": 0.021973, "value_mse_loss_layer_010": 0.018433, "value_mse_loss_layer_011": 0.020264, "value_mse_loss_layer_012": 0.021851, "value_mse_loss_layer_013": 0.022217, "value_mse_loss_layer_014": 0.022217, "value_mse_loss_layer_015": 0.024658, "value_mse_loss_layer_016": 0.02124, "value_mse_loss_layer_017": 0.025391, "value_mse_loss_layer_018": 0.021851, "value_mse_loss_layer_019": 0.026123, "value_mse_loss_layer_020": 0.028931, "value_mse_loss_layer_021": 0.033691, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.039062, "value_mse_loss_layer_025": 0.048096, "value_mse_loss_layer_026": 0.039307, "value_mse_loss_layer_027": 0.051514, "value_mse_loss_layer_028": 0.05542, "value_mse_loss_layer_029": 0.075684, "value_mse_loss_layer_030": 0.081543, "value_mse_loss_layer_031": 0.078613, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000195, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000233, "vq_loss_layer_011": 0.000315, "vq_loss_layer_012": 0.000507, "vq_loss_layer_013": 0.000391, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000439, "vq_loss_layer_016": 0.000481, "vq_loss_layer_017": 0.000423, "vq_loss_layer_018": 0.000226, "vq_loss_layer_019": 0.000229, "vq_loss_layer_020": 0.000239, "vq_loss_layer_021": 0.000458, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000307, "vq_loss_layer_024": 0.000345, "vq_loss_layer_025": 0.000359, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.004822 }, { "ce_loss": 2.289769, "epoch": 0.00554, "grad_norm": 0.0030463875737041235, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.054199, "kv_vq_loss": 0.000552, "learning_rate": 0.0009358774411821073, "loss": 0.054724, "step": 5540, "value_mse_loss_layer_000": 0.00066, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007446, "value_mse_loss_layer_003": 0.012207, "value_mse_loss_layer_004": 0.01123, "value_mse_loss_layer_005": 0.012024, "value_mse_loss_layer_006": 0.013062, "value_mse_loss_layer_007": 0.014404, "value_mse_loss_layer_008": 0.016846, "value_mse_loss_layer_009": 0.022095, "value_mse_loss_layer_010": 0.018188, "value_mse_loss_layer_011": 0.019043, "value_mse_loss_layer_012": 0.020508, "value_mse_loss_layer_013": 0.021118, "value_mse_loss_layer_014": 0.022705, "value_mse_loss_layer_015": 0.025146, "value_mse_loss_layer_016": 0.020752, "value_mse_loss_layer_017": 0.025024, "value_mse_loss_layer_018": 0.022339, "value_mse_loss_layer_019": 0.030029, "value_mse_loss_layer_020": 0.027588, "value_mse_loss_layer_021": 0.035156, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.036377, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.049561, "value_mse_loss_layer_026": 0.053467, "value_mse_loss_layer_027": 0.053467, "value_mse_loss_layer_028": 0.055664, "value_mse_loss_layer_029": 0.075195, "value_mse_loss_layer_030": 0.074219, "value_mse_loss_layer_031": 0.078125, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.7e-05, "vq_loss_layer_005": 0.000107, "vq_loss_layer_006": 0.000138, "vq_loss_layer_007": 0.000221, "vq_loss_layer_008": 0.000194, "vq_loss_layer_009": 0.000275, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000229, "vq_loss_layer_012": 0.000422, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000431, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.000362, "vq_loss_layer_018": 0.00022, "vq_loss_layer_019": 0.000212, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.000414, "vq_loss_layer_022": 0.000271, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000763, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.000938, "vq_loss_layer_030": 0.001701, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.321054, "epoch": 0.00555, "grad_norm": 0.003071090904995799, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.05423, "kv_vq_loss": 0.000546, "learning_rate": 0.0009360732457806689, "loss": 0.054755, "step": 5550, "value_mse_loss_layer_000": 0.000652, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007751, "value_mse_loss_layer_003": 0.012939, "value_mse_loss_layer_004": 0.01123, "value_mse_loss_layer_005": 0.011353, "value_mse_loss_layer_006": 0.013062, "value_mse_loss_layer_007": 0.014343, "value_mse_loss_layer_008": 0.017212, "value_mse_loss_layer_009": 0.022461, "value_mse_loss_layer_010": 0.019165, "value_mse_loss_layer_011": 0.020264, "value_mse_loss_layer_012": 0.020996, "value_mse_loss_layer_013": 0.022461, "value_mse_loss_layer_014": 0.023804, "value_mse_loss_layer_015": 0.025513, "value_mse_loss_layer_016": 0.020874, "value_mse_loss_layer_017": 0.024658, "value_mse_loss_layer_018": 0.022705, "value_mse_loss_layer_019": 0.026733, "value_mse_loss_layer_020": 0.027832, "value_mse_loss_layer_021": 0.031982, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.041504, "value_mse_loss_layer_024": 0.037109, "value_mse_loss_layer_025": 0.044922, "value_mse_loss_layer_026": 0.042236, "value_mse_loss_layer_027": 0.05127, "value_mse_loss_layer_028": 0.055908, "value_mse_loss_layer_029": 0.074707, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.07666, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000155, "vq_loss_layer_007": 0.000236, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000315, "vq_loss_layer_010": 0.000277, "vq_loss_layer_011": 0.000338, "vq_loss_layer_012": 0.000473, "vq_loss_layer_013": 0.000399, "vq_loss_layer_014": 0.000542, "vq_loss_layer_015": 0.000553, "vq_loss_layer_016": 0.000479, "vq_loss_layer_017": 0.000418, "vq_loss_layer_018": 0.000259, "vq_loss_layer_019": 0.000248, "vq_loss_layer_020": 0.000292, "vq_loss_layer_021": 0.000519, "vq_loss_layer_022": 0.000336, "vq_loss_layer_023": 0.000496, "vq_loss_layer_024": 0.000345, "vq_loss_layer_025": 0.000444, "vq_loss_layer_026": 0.000637, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.001244, "vq_loss_layer_029": 0.00135, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.005554 }, { "ce_loss": 2.312001, "epoch": 0.00556, "grad_norm": 0.0037935192231088877, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.054224, "kv_vq_loss": 0.000548, "learning_rate": 0.0009362686978955141, "loss": 0.054752, "step": 5560, "value_mse_loss_layer_000": 0.00066, "value_mse_loss_layer_001": 0.001999, "value_mse_loss_layer_002": 0.008667, "value_mse_loss_layer_003": 0.01178, "value_mse_loss_layer_004": 0.011353, "value_mse_loss_layer_005": 0.010803, "value_mse_loss_layer_006": 0.012695, "value_mse_loss_layer_007": 0.01416, "value_mse_loss_layer_008": 0.016968, "value_mse_loss_layer_009": 0.021484, "value_mse_loss_layer_010": 0.019653, "value_mse_loss_layer_011": 0.019287, "value_mse_loss_layer_012": 0.019897, "value_mse_loss_layer_013": 0.021484, "value_mse_loss_layer_014": 0.022217, "value_mse_loss_layer_015": 0.024902, "value_mse_loss_layer_016": 0.023438, "value_mse_loss_layer_017": 0.026001, "value_mse_loss_layer_018": 0.022095, "value_mse_loss_layer_019": 0.025635, "value_mse_loss_layer_020": 0.027222, "value_mse_loss_layer_021": 0.032471, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.034912, "value_mse_loss_layer_024": 0.039795, "value_mse_loss_layer_025": 0.04541, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.050293, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.093262, "value_mse_loss_layer_030": 0.070801, "value_mse_loss_layer_031": 0.07666, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000135, "vq_loss_layer_007": 0.000215, "vq_loss_layer_008": 0.000219, "vq_loss_layer_009": 0.000265, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000252, "vq_loss_layer_012": 0.000399, "vq_loss_layer_013": 0.000391, "vq_loss_layer_014": 0.000414, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000479, "vq_loss_layer_017": 0.000448, "vq_loss_layer_018": 0.000224, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.000429, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000298, "vq_loss_layer_024": 0.000311, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.00132, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.004822 }, { "ce_loss": 2.326624, "epoch": 0.00557, "grad_norm": 0.0032154140062630177, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.053818, "kv_vq_loss": 0.000533, "learning_rate": 0.000936463798793432, "loss": 0.054324, "step": 5570, "value_mse_loss_layer_000": 0.000656, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007477, "value_mse_loss_layer_003": 0.012146, "value_mse_loss_layer_004": 0.010925, "value_mse_loss_layer_005": 0.010803, "value_mse_loss_layer_006": 0.013489, "value_mse_loss_layer_007": 0.014343, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.022339, "value_mse_loss_layer_010": 0.019409, "value_mse_loss_layer_011": 0.020264, "value_mse_loss_layer_012": 0.021973, "value_mse_loss_layer_013": 0.022583, "value_mse_loss_layer_014": 0.023682, "value_mse_loss_layer_015": 0.027832, "value_mse_loss_layer_016": 0.023804, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.026978, "value_mse_loss_layer_019": 0.033203, "value_mse_loss_layer_020": 0.028076, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.033203, "value_mse_loss_layer_023": 0.035156, "value_mse_loss_layer_024": 0.037109, "value_mse_loss_layer_025": 0.049316, "value_mse_loss_layer_026": 0.039551, "value_mse_loss_layer_027": 0.050049, "value_mse_loss_layer_028": 0.055908, "value_mse_loss_layer_029": 0.079102, "value_mse_loss_layer_030": 0.072754, "value_mse_loss_layer_031": 0.075684, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000166, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000259, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.000486, "vq_loss_layer_013": 0.000404, "vq_loss_layer_014": 0.000448, "vq_loss_layer_015": 0.000519, "vq_loss_layer_016": 0.000492, "vq_loss_layer_017": 0.000399, "vq_loss_layer_018": 0.00034, "vq_loss_layer_019": 0.000207, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000395, "vq_loss_layer_022": 0.00033, "vq_loss_layer_023": 0.000326, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000376, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000641, "vq_loss_layer_028": 0.000938, "vq_loss_layer_029": 0.001511, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.004395 }, { "ce_loss": 2.334861, "epoch": 0.00558, "grad_norm": 0.0028408102225512266, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.054086, "kv_vq_loss": 0.000526, "learning_rate": 0.0009366585497343944, "loss": 0.054596, "step": 5580, "value_mse_loss_layer_000": 0.000656, "value_mse_loss_layer_001": 0.001953, "value_mse_loss_layer_002": 0.007294, "value_mse_loss_layer_003": 0.01178, "value_mse_loss_layer_004": 0.010681, "value_mse_loss_layer_005": 0.010986, "value_mse_loss_layer_006": 0.013184, "value_mse_loss_layer_007": 0.014282, "value_mse_loss_layer_008": 0.0177, "value_mse_loss_layer_009": 0.024902, "value_mse_loss_layer_010": 0.019043, "value_mse_loss_layer_011": 0.020264, "value_mse_loss_layer_012": 0.02063, "value_mse_loss_layer_013": 0.023804, "value_mse_loss_layer_014": 0.023926, "value_mse_loss_layer_015": 0.026245, "value_mse_loss_layer_016": 0.021484, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.02417, "value_mse_loss_layer_019": 0.028442, "value_mse_loss_layer_020": 0.028442, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.031006, "value_mse_loss_layer_023": 0.035889, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.047607, "value_mse_loss_layer_026": 0.037598, "value_mse_loss_layer_027": 0.051025, "value_mse_loss_layer_028": 0.054443, "value_mse_loss_layer_029": 0.070312, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.07373, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000148, "vq_loss_layer_007": 0.000207, "vq_loss_layer_008": 0.000246, "vq_loss_layer_009": 0.000378, "vq_loss_layer_010": 0.000231, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.000429, "vq_loss_layer_013": 0.000448, "vq_loss_layer_014": 0.000507, "vq_loss_layer_015": 0.000492, "vq_loss_layer_016": 0.000423, "vq_loss_layer_017": 0.00038, "vq_loss_layer_018": 0.000261, "vq_loss_layer_019": 0.000216, "vq_loss_layer_020": 0.000257, "vq_loss_layer_021": 0.000444, "vq_loss_layer_022": 0.000292, "vq_loss_layer_023": 0.000393, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.00038, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.00415 }, { "ce_loss": 2.294513, "epoch": 0.00559, "grad_norm": 0.003771479008719325, "key_mse_loss_layer_000": 0.002762, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.076172, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.053912, "kv_vq_loss": 0.000546, "learning_rate": 0.0009368529519716057, "loss": 0.054437, "step": 5590, "value_mse_loss_layer_000": 0.000652, "value_mse_loss_layer_001": 0.00193, "value_mse_loss_layer_002": 0.007996, "value_mse_loss_layer_003": 0.011902, "value_mse_loss_layer_004": 0.012451, "value_mse_loss_layer_005": 0.011169, "value_mse_loss_layer_006": 0.012939, "value_mse_loss_layer_007": 0.013916, "value_mse_loss_layer_008": 0.016724, "value_mse_loss_layer_009": 0.022217, "value_mse_loss_layer_010": 0.020752, "value_mse_loss_layer_011": 0.019287, "value_mse_loss_layer_012": 0.020142, "value_mse_loss_layer_013": 0.022217, "value_mse_loss_layer_014": 0.023071, "value_mse_loss_layer_015": 0.025513, "value_mse_loss_layer_016": 0.021606, "value_mse_loss_layer_017": 0.026733, "value_mse_loss_layer_018": 0.021606, "value_mse_loss_layer_019": 0.026001, "value_mse_loss_layer_020": 0.02832, "value_mse_loss_layer_021": 0.033447, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.035156, "value_mse_loss_layer_024": 0.036133, "value_mse_loss_layer_025": 0.04834, "value_mse_loss_layer_026": 0.04126, "value_mse_loss_layer_027": 0.050293, "value_mse_loss_layer_028": 0.054688, "value_mse_loss_layer_029": 0.081055, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.07959, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 0.000131, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000147, "vq_loss_layer_007": 0.000199, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000278, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000259, "vq_loss_layer_012": 0.000431, "vq_loss_layer_013": 0.000391, "vq_loss_layer_014": 0.000458, "vq_loss_layer_015": 0.000486, "vq_loss_layer_016": 0.000458, "vq_loss_layer_017": 0.000462, "vq_loss_layer_018": 0.000227, "vq_loss_layer_019": 0.00021, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.000507, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.00037, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000414, "vq_loss_layer_026": 0.000599, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000862, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.004913 }, { "ce_loss": 2.302117, "epoch": 0.0056, "grad_norm": 0.002991805085912347, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.054144, "kv_vq_loss": 0.000548, "learning_rate": 0.00093704700675155, "loss": 0.054669, "step": 5600, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007874, "value_mse_loss_layer_003": 0.014465, "value_mse_loss_layer_004": 0.011475, "value_mse_loss_layer_005": 0.011475, "value_mse_loss_layer_006": 0.013184, "value_mse_loss_layer_007": 0.014221, "value_mse_loss_layer_008": 0.016968, "value_mse_loss_layer_009": 0.021973, "value_mse_loss_layer_010": 0.019165, "value_mse_loss_layer_011": 0.020386, "value_mse_loss_layer_012": 0.019775, "value_mse_loss_layer_013": 0.021484, "value_mse_loss_layer_014": 0.022827, "value_mse_loss_layer_015": 0.025269, "value_mse_loss_layer_016": 0.021606, "value_mse_loss_layer_017": 0.024902, "value_mse_loss_layer_018": 0.023438, "value_mse_loss_layer_019": 0.026611, "value_mse_loss_layer_020": 0.028442, "value_mse_loss_layer_021": 0.033203, "value_mse_loss_layer_022": 0.033447, "value_mse_loss_layer_023": 0.037109, "value_mse_loss_layer_024": 0.039551, "value_mse_loss_layer_025": 0.048584, "value_mse_loss_layer_026": 0.04248, "value_mse_loss_layer_027": 0.056152, "value_mse_loss_layer_028": 0.056885, "value_mse_loss_layer_029": 0.078125, "value_mse_loss_layer_030": 0.078125, "value_mse_loss_layer_031": 0.085938, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 9.9e-05, "vq_loss_layer_006": 0.000142, "vq_loss_layer_007": 0.000208, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000257, "vq_loss_layer_011": 0.00036, "vq_loss_layer_012": 0.000406, "vq_loss_layer_013": 0.000364, "vq_loss_layer_014": 0.000462, "vq_loss_layer_015": 0.000486, "vq_loss_layer_016": 0.000456, "vq_loss_layer_017": 0.000408, "vq_loss_layer_018": 0.000284, "vq_loss_layer_019": 0.00028, "vq_loss_layer_020": 0.000265, "vq_loss_layer_021": 0.000416, "vq_loss_layer_022": 0.000309, "vq_loss_layer_023": 0.000353, "vq_loss_layer_024": 0.000338, "vq_loss_layer_025": 0.00046, "vq_loss_layer_026": 0.000584, "vq_loss_layer_027": 0.000683, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.001549, "vq_loss_layer_030": 0.002686, "vq_loss_layer_031": 0.005676 }, { "ce_loss": 2.328349, "epoch": 0.00561, "grad_norm": 0.0032861537765711546, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.107422, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.053912, "kv_vq_loss": 0.000571, "learning_rate": 0.0009372407153140402, "loss": 0.054465, "step": 5610, "value_mse_loss_layer_000": 0.000671, "value_mse_loss_layer_001": 0.001999, "value_mse_loss_layer_002": 0.007233, "value_mse_loss_layer_003": 0.013306, "value_mse_loss_layer_004": 0.01062, "value_mse_loss_layer_005": 0.010742, "value_mse_loss_layer_006": 0.012573, "value_mse_loss_layer_007": 0.013977, "value_mse_loss_layer_008": 0.017212, "value_mse_loss_layer_009": 0.021851, "value_mse_loss_layer_010": 0.018188, "value_mse_loss_layer_011": 0.019287, "value_mse_loss_layer_012": 0.020142, "value_mse_loss_layer_013": 0.021118, "value_mse_loss_layer_014": 0.022339, "value_mse_loss_layer_015": 0.025391, "value_mse_loss_layer_016": 0.020752, "value_mse_loss_layer_017": 0.026367, "value_mse_loss_layer_018": 0.02417, "value_mse_loss_layer_019": 0.026978, "value_mse_loss_layer_020": 0.029419, "value_mse_loss_layer_021": 0.034424, "value_mse_loss_layer_022": 0.032471, "value_mse_loss_layer_023": 0.037354, "value_mse_loss_layer_024": 0.039062, "value_mse_loss_layer_025": 0.046631, "value_mse_loss_layer_026": 0.041504, "value_mse_loss_layer_027": 0.054688, "value_mse_loss_layer_028": 0.069336, "value_mse_loss_layer_029": 0.081543, "value_mse_loss_layer_030": 0.076172, "value_mse_loss_layer_031": 0.078613, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000202, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000265, "vq_loss_layer_010": 0.000202, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.00041, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.000399, "vq_loss_layer_015": 0.00045, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000441, "vq_loss_layer_018": 0.000244, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000225, "vq_loss_layer_021": 0.000429, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000343, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.000401, "vq_loss_layer_026": 0.000568, "vq_loss_layer_027": 0.000679, "vq_loss_layer_028": 0.00148, "vq_loss_layer_029": 0.00238, "vq_loss_layer_030": 0.003754, "vq_loss_layer_031": 0.006134 }, { "ce_loss": 2.311079, "epoch": 0.00562, "grad_norm": 0.003490125061944127, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.054086, "kv_vq_loss": 0.000564, "learning_rate": 0.0009374340788922652, "loss": 0.054626, "step": 5620, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.001953, "value_mse_loss_layer_002": 0.007294, "value_mse_loss_layer_003": 0.011963, "value_mse_loss_layer_004": 0.011169, "value_mse_loss_layer_005": 0.011902, "value_mse_loss_layer_006": 0.012878, "value_mse_loss_layer_007": 0.013855, "value_mse_loss_layer_008": 0.016479, "value_mse_loss_layer_009": 0.022217, "value_mse_loss_layer_010": 0.018188, "value_mse_loss_layer_011": 0.019775, "value_mse_loss_layer_012": 0.020508, "value_mse_loss_layer_013": 0.021851, "value_mse_loss_layer_014": 0.024902, "value_mse_loss_layer_015": 0.025513, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.024536, "value_mse_loss_layer_018": 0.022461, "value_mse_loss_layer_019": 0.026978, "value_mse_loss_layer_020": 0.0271, "value_mse_loss_layer_021": 0.031982, "value_mse_loss_layer_022": 0.031982, "value_mse_loss_layer_023": 0.034912, "value_mse_loss_layer_024": 0.043213, "value_mse_loss_layer_025": 0.044922, "value_mse_loss_layer_026": 0.049805, "value_mse_loss_layer_027": 0.05127, "value_mse_loss_layer_028": 0.054688, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.070801, "value_mse_loss_layer_031": 0.078613, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 0.000128, "vq_loss_layer_006": 0.000155, "vq_loss_layer_007": 0.000211, "vq_loss_layer_008": 0.000243, "vq_loss_layer_009": 0.000328, "vq_loss_layer_010": 0.000257, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000441, "vq_loss_layer_013": 0.000355, "vq_loss_layer_014": 0.00061, "vq_loss_layer_015": 0.000546, "vq_loss_layer_016": 0.000473, "vq_loss_layer_017": 0.000441, "vq_loss_layer_018": 0.000244, "vq_loss_layer_019": 0.000236, "vq_loss_layer_020": 0.000265, "vq_loss_layer_021": 0.000463, "vq_loss_layer_022": 0.000328, "vq_loss_layer_023": 0.000391, "vq_loss_layer_024": 0.000492, "vq_loss_layer_025": 0.000443, "vq_loss_layer_026": 0.000923, "vq_loss_layer_027": 0.00069, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.00116, "vq_loss_layer_030": 0.003265, "vq_loss_layer_031": 0.005035 }, { "ce_loss": 2.295007, "epoch": 0.00563, "grad_norm": 0.0028291288763284683, "key_mse_loss_layer_000": 0.003586, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.054169, "kv_vq_loss": 0.000565, "learning_rate": 0.0009376270987128363, "loss": 0.054718, "step": 5630, "value_mse_loss_layer_000": 0.000675, "value_mse_loss_layer_001": 0.001999, "value_mse_loss_layer_002": 0.007477, "value_mse_loss_layer_003": 0.011963, "value_mse_loss_layer_004": 0.011475, "value_mse_loss_layer_005": 0.010681, "value_mse_loss_layer_006": 0.012817, "value_mse_loss_layer_007": 0.01355, "value_mse_loss_layer_008": 0.017212, "value_mse_loss_layer_009": 0.021729, "value_mse_loss_layer_010": 0.020996, "value_mse_loss_layer_011": 0.018921, "value_mse_loss_layer_012": 0.020508, "value_mse_loss_layer_013": 0.02124, "value_mse_loss_layer_014": 0.022461, "value_mse_loss_layer_015": 0.024292, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.023926, "value_mse_loss_layer_018": 0.021729, "value_mse_loss_layer_019": 0.025879, "value_mse_loss_layer_020": 0.026611, "value_mse_loss_layer_021": 0.033691, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.03833, "value_mse_loss_layer_025": 0.046875, "value_mse_loss_layer_026": 0.04248, "value_mse_loss_layer_027": 0.05127, "value_mse_loss_layer_028": 0.055176, "value_mse_loss_layer_029": 0.076172, "value_mse_loss_layer_030": 0.078125, "value_mse_loss_layer_031": 0.075684, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000147, "vq_loss_layer_007": 0.0002, "vq_loss_layer_008": 0.000232, "vq_loss_layer_009": 0.00028, "vq_loss_layer_010": 0.000288, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.0005, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.000454, "vq_loss_layer_015": 0.000454, "vq_loss_layer_016": 0.000435, "vq_loss_layer_017": 0.000404, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.00019, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000437, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000561, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000843, "vq_loss_layer_029": 0.001068, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.307124, "epoch": 0.00564, "grad_norm": 0.0027104620821774006, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.086426, "key_mse_loss_layer_023": 0.083984, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.054181, "kv_vq_loss": 0.000565, "learning_rate": 0.0009378197759958354, "loss": 0.054727, "step": 5640, "value_mse_loss_layer_000": 0.000648, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007599, "value_mse_loss_layer_003": 0.012512, "value_mse_loss_layer_004": 0.01123, "value_mse_loss_layer_005": 0.010803, "value_mse_loss_layer_006": 0.012817, "value_mse_loss_layer_007": 0.014038, "value_mse_loss_layer_008": 0.016724, "value_mse_loss_layer_009": 0.021729, "value_mse_loss_layer_010": 0.02063, "value_mse_loss_layer_011": 0.019653, "value_mse_loss_layer_012": 0.020386, "value_mse_loss_layer_013": 0.021606, "value_mse_loss_layer_014": 0.025024, "value_mse_loss_layer_015": 0.024536, "value_mse_loss_layer_016": 0.021729, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.025269, "value_mse_loss_layer_019": 0.026611, "value_mse_loss_layer_020": 0.026611, "value_mse_loss_layer_021": 0.03833, "value_mse_loss_layer_022": 0.030273, "value_mse_loss_layer_023": 0.033203, "value_mse_loss_layer_024": 0.035645, "value_mse_loss_layer_025": 0.046875, "value_mse_loss_layer_026": 0.039795, "value_mse_loss_layer_027": 0.050049, "value_mse_loss_layer_028": 0.053467, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.075684, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000206, "vq_loss_layer_008": 0.00022, "vq_loss_layer_009": 0.000267, "vq_loss_layer_010": 0.00029, "vq_loss_layer_011": 0.000286, "vq_loss_layer_012": 0.000431, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.00061, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.000475, "vq_loss_layer_017": 0.000402, "vq_loss_layer_018": 0.000336, "vq_loss_layer_019": 0.000221, "vq_loss_layer_020": 0.000252, "vq_loss_layer_021": 0.000706, "vq_loss_layer_022": 0.000298, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000319, "vq_loss_layer_025": 0.000481, "vq_loss_layer_026": 0.000599, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.00264, "vq_loss_layer_031": 0.005432 }, { "ce_loss": 2.339935, "epoch": 0.00565, "grad_norm": 0.0031067626550793648, "key_mse_loss_layer_000": 0.003586, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.124512, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.107422, "key_mse_loss_layer_017": 0.107422, "key_mse_loss_layer_018": 0.116211, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.105469, "key_mse_loss_layer_021": 0.100586, "key_mse_loss_layer_022": 0.105469, "key_mse_loss_layer_023": 0.103516, "key_mse_loss_layer_024": 0.083008, "key_mse_loss_layer_025": 0.077148, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.088867, "key_mse_loss_layer_028": 0.097168, "key_mse_loss_layer_029": 0.088867, "key_mse_loss_layer_030": 0.100098, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.054532, "kv_vq_loss": 0.000542, "learning_rate": 0.0009380121119548596, "loss": 0.055054, "step": 5650, "value_mse_loss_layer_000": 0.000648, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007385, "value_mse_loss_layer_003": 0.012329, "value_mse_loss_layer_004": 0.010925, "value_mse_loss_layer_005": 0.011169, "value_mse_loss_layer_006": 0.012573, "value_mse_loss_layer_007": 0.014099, "value_mse_loss_layer_008": 0.016479, "value_mse_loss_layer_009": 0.022217, "value_mse_loss_layer_010": 0.017944, "value_mse_loss_layer_011": 0.018677, "value_mse_loss_layer_012": 0.018799, "value_mse_loss_layer_013": 0.020874, "value_mse_loss_layer_014": 0.022095, "value_mse_loss_layer_015": 0.023193, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.023071, "value_mse_loss_layer_018": 0.02356, "value_mse_loss_layer_019": 0.024658, "value_mse_loss_layer_020": 0.030029, "value_mse_loss_layer_021": 0.032471, "value_mse_loss_layer_022": 0.030762, "value_mse_loss_layer_023": 0.032959, "value_mse_loss_layer_024": 0.036621, "value_mse_loss_layer_025": 0.044189, "value_mse_loss_layer_026": 0.037842, "value_mse_loss_layer_027": 0.050537, "value_mse_loss_layer_028": 0.060303, "value_mse_loss_layer_029": 0.074707, "value_mse_loss_layer_030": 0.071289, "value_mse_loss_layer_031": 0.079102, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 8.9e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000232, "vq_loss_layer_009": 0.000341, "vq_loss_layer_010": 0.000242, "vq_loss_layer_011": 0.000259, "vq_loss_layer_012": 0.000393, "vq_loss_layer_013": 0.000355, "vq_loss_layer_014": 0.000523, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.00042, "vq_loss_layer_017": 0.000353, "vq_loss_layer_018": 0.000263, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000233, "vq_loss_layer_021": 0.000463, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000311, "vq_loss_layer_025": 0.000374, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.001137, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.005005 }, { "ce_loss": 2.289857, "epoch": 0.00566, "grad_norm": 0.0030777223873883486, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.063965, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.071777, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.08252, "kv_mse_loss": 0.054736, "kv_vq_loss": 0.000562, "learning_rate": 0.0009382041077970677, "loss": 0.055283, "step": 5660, "value_mse_loss_layer_000": 0.000671, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007416, "value_mse_loss_layer_003": 0.012573, "value_mse_loss_layer_004": 0.010681, "value_mse_loss_layer_005": 0.010376, "value_mse_loss_layer_006": 0.012573, "value_mse_loss_layer_007": 0.013489, "value_mse_loss_layer_008": 0.016968, "value_mse_loss_layer_009": 0.021851, "value_mse_loss_layer_010": 0.018433, "value_mse_loss_layer_011": 0.019897, "value_mse_loss_layer_012": 0.020996, "value_mse_loss_layer_013": 0.02124, "value_mse_loss_layer_014": 0.022461, "value_mse_loss_layer_015": 0.025024, "value_mse_loss_layer_016": 0.021362, "value_mse_loss_layer_017": 0.025513, "value_mse_loss_layer_018": 0.02417, "value_mse_loss_layer_019": 0.026733, "value_mse_loss_layer_020": 0.029053, "value_mse_loss_layer_021": 0.032959, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.035889, "value_mse_loss_layer_024": 0.040039, "value_mse_loss_layer_025": 0.046387, "value_mse_loss_layer_026": 0.049072, "value_mse_loss_layer_027": 0.052002, "value_mse_loss_layer_028": 0.058105, "value_mse_loss_layer_029": 0.084961, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.076172, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.00014, "vq_loss_layer_007": 0.000214, "vq_loss_layer_008": 0.000213, "vq_loss_layer_009": 0.000294, "vq_loss_layer_010": 0.000215, "vq_loss_layer_011": 0.000294, "vq_loss_layer_012": 0.000496, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000427, "vq_loss_layer_015": 0.000477, "vq_loss_layer_016": 0.000435, "vq_loss_layer_017": 0.000429, "vq_loss_layer_018": 0.000252, "vq_loss_layer_019": 0.000244, "vq_loss_layer_020": 0.000248, "vq_loss_layer_021": 0.000378, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.000751, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000977, "vq_loss_layer_029": 0.001328, "vq_loss_layer_030": 0.002731, "vq_loss_layer_031": 0.004547 }, { "ce_loss": 2.334204, "epoch": 0.00567, "grad_norm": 0.0025636053178459406, "key_mse_loss_layer_000": 0.003998, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.064941, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.078613, "kv_mse_loss": 0.054117, "kv_vq_loss": 0.000552, "learning_rate": 0.0009383957647232267, "loss": 0.05466, "step": 5670, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.002045, "value_mse_loss_layer_002": 0.007507, "value_mse_loss_layer_003": 0.012207, "value_mse_loss_layer_004": 0.010864, "value_mse_loss_layer_005": 0.01062, "value_mse_loss_layer_006": 0.012878, "value_mse_loss_layer_007": 0.014282, "value_mse_loss_layer_008": 0.016724, "value_mse_loss_layer_009": 0.021606, "value_mse_loss_layer_010": 0.018677, "value_mse_loss_layer_011": 0.019531, "value_mse_loss_layer_012": 0.02002, "value_mse_loss_layer_013": 0.021851, "value_mse_loss_layer_014": 0.022949, "value_mse_loss_layer_015": 0.025024, "value_mse_loss_layer_016": 0.020996, "value_mse_loss_layer_017": 0.024292, "value_mse_loss_layer_018": 0.023193, "value_mse_loss_layer_019": 0.027588, "value_mse_loss_layer_020": 0.027344, "value_mse_loss_layer_021": 0.032227, "value_mse_loss_layer_022": 0.033936, "value_mse_loss_layer_023": 0.036133, "value_mse_loss_layer_024": 0.039062, "value_mse_loss_layer_025": 0.046143, "value_mse_loss_layer_026": 0.041992, "value_mse_loss_layer_027": 0.052979, "value_mse_loss_layer_028": 0.059326, "value_mse_loss_layer_029": 0.074707, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.079102, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 9.2e-05, "vq_loss_layer_006": 0.000149, "vq_loss_layer_007": 0.00023, "vq_loss_layer_008": 0.000209, "vq_loss_layer_009": 0.000271, "vq_loss_layer_010": 0.000226, "vq_loss_layer_011": 0.000277, "vq_loss_layer_012": 0.000433, "vq_loss_layer_013": 0.000385, "vq_loss_layer_014": 0.000467, "vq_loss_layer_015": 0.000549, "vq_loss_layer_016": 0.000427, "vq_loss_layer_017": 0.000362, "vq_loss_layer_018": 0.000244, "vq_loss_layer_019": 0.00023, "vq_loss_layer_020": 0.000221, "vq_loss_layer_021": 0.000399, "vq_loss_layer_022": 0.000322, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000576, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.001175, "vq_loss_layer_029": 0.001259, "vq_loss_layer_030": 0.002823, "vq_loss_layer_031": 0.004852 }, { "ce_loss": 2.310624, "epoch": 0.00568, "grad_norm": 0.00354875554330647, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.054156, "kv_vq_loss": 0.000547, "learning_rate": 0.0009385870839277546, "loss": 0.054691, "step": 5680, "value_mse_loss_layer_000": 0.000633, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007568, "value_mse_loss_layer_003": 0.011658, "value_mse_loss_layer_004": 0.010986, "value_mse_loss_layer_005": 0.011353, "value_mse_loss_layer_006": 0.013428, "value_mse_loss_layer_007": 0.014038, "value_mse_loss_layer_008": 0.016968, "value_mse_loss_layer_009": 0.022217, "value_mse_loss_layer_010": 0.018555, "value_mse_loss_layer_011": 0.020264, "value_mse_loss_layer_012": 0.025757, "value_mse_loss_layer_013": 0.021973, "value_mse_loss_layer_014": 0.024536, "value_mse_loss_layer_015": 0.02478, "value_mse_loss_layer_016": 0.020386, "value_mse_loss_layer_017": 0.025146, "value_mse_loss_layer_018": 0.021851, "value_mse_loss_layer_019": 0.025635, "value_mse_loss_layer_020": 0.02832, "value_mse_loss_layer_021": 0.032471, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.038574, "value_mse_loss_layer_025": 0.053711, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.071777, "value_mse_loss_layer_030": 0.071777, "value_mse_loss_layer_031": 0.075195, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 0.000101, "vq_loss_layer_006": 0.00018, "vq_loss_layer_007": 0.000218, "vq_loss_layer_008": 0.000235, "vq_loss_layer_009": 0.000299, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000303, "vq_loss_layer_012": 0.000774, "vq_loss_layer_013": 0.000374, "vq_loss_layer_014": 0.000526, "vq_loss_layer_015": 0.000484, "vq_loss_layer_016": 0.000408, "vq_loss_layer_017": 0.000435, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000191, "vq_loss_layer_020": 0.000241, "vq_loss_layer_021": 0.000423, "vq_loss_layer_022": 0.000288, "vq_loss_layer_023": 0.00029, "vq_loss_layer_024": 0.000303, "vq_loss_layer_025": 0.000441, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.000832, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.003159, "vq_loss_layer_031": 0.004639 }, { "ce_loss": 2.273965, "epoch": 0.00569, "grad_norm": 0.0030165105126798153, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.054434, "kv_vq_loss": 0.000569, "learning_rate": 0.0009387780665987677, "loss": 0.05499, "step": 5690, "value_mse_loss_layer_000": 0.000683, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007507, "value_mse_loss_layer_003": 0.012573, "value_mse_loss_layer_004": 0.011108, "value_mse_loss_layer_005": 0.01062, "value_mse_loss_layer_006": 0.013428, "value_mse_loss_layer_007": 0.013733, "value_mse_loss_layer_008": 0.016968, "value_mse_loss_layer_009": 0.021729, "value_mse_loss_layer_010": 0.018555, "value_mse_loss_layer_011": 0.019653, "value_mse_loss_layer_012": 0.020874, "value_mse_loss_layer_013": 0.022095, "value_mse_loss_layer_014": 0.023926, "value_mse_loss_layer_015": 0.025879, "value_mse_loss_layer_016": 0.022949, "value_mse_loss_layer_017": 0.025024, "value_mse_loss_layer_018": 0.027466, "value_mse_loss_layer_019": 0.026001, "value_mse_loss_layer_020": 0.028442, "value_mse_loss_layer_021": 0.032471, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.033936, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.050293, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.051514, "value_mse_loss_layer_028": 0.056641, "value_mse_loss_layer_029": 0.073242, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.077637, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000153, "vq_loss_layer_007": 0.000196, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000246, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.000462, "vq_loss_layer_013": 0.000416, "vq_loss_layer_014": 0.0005, "vq_loss_layer_015": 0.000526, "vq_loss_layer_016": 0.000538, "vq_loss_layer_017": 0.000376, "vq_loss_layer_018": 0.000372, "vq_loss_layer_019": 0.000211, "vq_loss_layer_020": 0.000273, "vq_loss_layer_021": 0.000412, "vq_loss_layer_022": 0.000347, "vq_loss_layer_023": 0.000324, "vq_loss_layer_024": 0.000378, "vq_loss_layer_025": 0.000463, "vq_loss_layer_026": 0.000553, "vq_loss_layer_027": 0.000656, "vq_loss_layer_028": 0.001129, "vq_loss_layer_029": 0.001793, "vq_loss_layer_030": 0.003464, "vq_loss_layer_031": 0.005493 }, { "ce_loss": 2.30811, "epoch": 0.0057, "grad_norm": 0.002826100680977106, "key_mse_loss_layer_000": 0.003586, "key_mse_loss_layer_001": 0.011414, "key_mse_loss_layer_002": 0.061035, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.06543, "key_mse_loss_layer_006": 0.072266, "key_mse_loss_layer_007": 0.081055, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.106445, "key_mse_loss_layer_012": 0.080078, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.124512, "key_mse_loss_layer_015": 0.114258, "key_mse_loss_layer_016": 0.106934, "key_mse_loss_layer_017": 0.109863, "key_mse_loss_layer_018": 0.114746, "key_mse_loss_layer_019": 0.101074, "key_mse_loss_layer_020": 0.110352, "key_mse_loss_layer_021": 0.104492, "key_mse_loss_layer_022": 0.104004, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.080566, "key_mse_loss_layer_025": 0.07959, "key_mse_loss_layer_026": 0.092773, "key_mse_loss_layer_027": 0.089844, "key_mse_loss_layer_028": 0.095703, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.054434, "kv_vq_loss": 0.000559, "learning_rate": 0.0009389687139181226, "loss": 0.054984, "step": 5700, "value_mse_loss_layer_000": 0.000645, "value_mse_loss_layer_001": 0.001999, "value_mse_loss_layer_002": 0.008362, "value_mse_loss_layer_003": 0.013062, "value_mse_loss_layer_004": 0.012268, "value_mse_loss_layer_005": 0.012695, "value_mse_loss_layer_006": 0.013794, "value_mse_loss_layer_007": 0.014954, "value_mse_loss_layer_008": 0.017456, "value_mse_loss_layer_009": 0.022827, "value_mse_loss_layer_010": 0.020142, "value_mse_loss_layer_011": 0.020874, "value_mse_loss_layer_012": 0.021362, "value_mse_loss_layer_013": 0.023804, "value_mse_loss_layer_014": 0.023438, "value_mse_loss_layer_015": 0.02478, "value_mse_loss_layer_016": 0.022461, "value_mse_loss_layer_017": 0.02478, "value_mse_loss_layer_018": 0.023315, "value_mse_loss_layer_019": 0.026855, "value_mse_loss_layer_020": 0.028198, "value_mse_loss_layer_021": 0.031982, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.046631, "value_mse_loss_layer_026": 0.039307, "value_mse_loss_layer_027": 0.053223, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.07373, "value_mse_loss_layer_030": 0.072266, "value_mse_loss_layer_031": 0.081055, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 2.2e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 8.7e-05, "vq_loss_layer_005": 0.000138, "vq_loss_layer_006": 0.000159, "vq_loss_layer_007": 0.000225, "vq_loss_layer_008": 0.000254, "vq_loss_layer_009": 0.00036, "vq_loss_layer_010": 0.000322, "vq_loss_layer_011": 0.000336, "vq_loss_layer_012": 0.000462, "vq_loss_layer_013": 0.000479, "vq_loss_layer_014": 0.00053, "vq_loss_layer_015": 0.000519, "vq_loss_layer_016": 0.000587, "vq_loss_layer_017": 0.000431, "vq_loss_layer_018": 0.00036, "vq_loss_layer_019": 0.00033, "vq_loss_layer_020": 0.00033, "vq_loss_layer_021": 0.000534, "vq_loss_layer_022": 0.000429, "vq_loss_layer_023": 0.000366, "vq_loss_layer_024": 0.000481, "vq_loss_layer_025": 0.000725, "vq_loss_layer_026": 0.00074, "vq_loss_layer_027": 0.00116, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.001747, "vq_loss_layer_030": 0.003891, "vq_loss_layer_031": 0.006226 }, { "ce_loss": 2.334283, "epoch": 0.00571, "grad_norm": 0.003487854963168502, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.054211, "kv_vq_loss": 0.000557, "learning_rate": 0.000939159027061462, "loss": 0.054752, "step": 5710, "value_mse_loss_layer_000": 0.000652, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007385, "value_mse_loss_layer_003": 0.011841, "value_mse_loss_layer_004": 0.012329, "value_mse_loss_layer_005": 0.010864, "value_mse_loss_layer_006": 0.013184, "value_mse_loss_layer_007": 0.014343, "value_mse_loss_layer_008": 0.016479, "value_mse_loss_layer_009": 0.021362, "value_mse_loss_layer_010": 0.017822, "value_mse_loss_layer_011": 0.019165, "value_mse_loss_layer_012": 0.020386, "value_mse_loss_layer_013": 0.021851, "value_mse_loss_layer_014": 0.023071, "value_mse_loss_layer_015": 0.025757, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.025513, "value_mse_loss_layer_018": 0.022217, "value_mse_loss_layer_019": 0.026001, "value_mse_loss_layer_020": 0.027466, "value_mse_loss_layer_021": 0.033447, "value_mse_loss_layer_022": 0.030396, "value_mse_loss_layer_023": 0.034668, "value_mse_loss_layer_024": 0.037842, "value_mse_loss_layer_025": 0.048584, "value_mse_loss_layer_026": 0.041016, "value_mse_loss_layer_027": 0.052246, "value_mse_loss_layer_028": 0.054443, "value_mse_loss_layer_029": 0.081055, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.078613, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 0.000116, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000168, "vq_loss_layer_007": 0.00023, "vq_loss_layer_008": 0.000234, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000246, "vq_loss_layer_011": 0.000284, "vq_loss_layer_012": 0.000484, "vq_loss_layer_013": 0.000387, "vq_loss_layer_014": 0.000507, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.000444, "vq_loss_layer_017": 0.000444, "vq_loss_layer_018": 0.000231, "vq_loss_layer_019": 0.000209, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.0005, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000341, "vq_loss_layer_024": 0.000319, "vq_loss_layer_025": 0.000475, "vq_loss_layer_026": 0.00061, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.001259, "vq_loss_layer_030": 0.002655, "vq_loss_layer_031": 0.005341 }, { "ce_loss": 2.326535, "epoch": 0.00572, "grad_norm": 0.0028693077620118856, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.054092, "kv_vq_loss": 0.000564, "learning_rate": 0.0009393490071982559, "loss": 0.05463, "step": 5720, "value_mse_loss_layer_000": 0.000645, "value_mse_loss_layer_001": 0.001953, "value_mse_loss_layer_002": 0.008484, "value_mse_loss_layer_003": 0.014893, "value_mse_loss_layer_004": 0.011841, "value_mse_loss_layer_005": 0.012695, "value_mse_loss_layer_006": 0.013306, "value_mse_loss_layer_007": 0.014648, "value_mse_loss_layer_008": 0.017944, "value_mse_loss_layer_009": 0.023804, "value_mse_loss_layer_010": 0.019409, "value_mse_loss_layer_011": 0.020386, "value_mse_loss_layer_012": 0.022461, "value_mse_loss_layer_013": 0.023438, "value_mse_loss_layer_014": 0.023804, "value_mse_loss_layer_015": 0.026245, "value_mse_loss_layer_016": 0.022949, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.022461, "value_mse_loss_layer_019": 0.025879, "value_mse_loss_layer_020": 0.028076, "value_mse_loss_layer_021": 0.032227, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.032959, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.047852, "value_mse_loss_layer_026": 0.040527, "value_mse_loss_layer_027": 0.049072, "value_mse_loss_layer_028": 0.054932, "value_mse_loss_layer_029": 0.071777, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.07666, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 7.7e-05, "vq_loss_layer_005": 0.000124, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000228, "vq_loss_layer_008": 0.000286, "vq_loss_layer_009": 0.000359, "vq_loss_layer_010": 0.000273, "vq_loss_layer_011": 0.000294, "vq_loss_layer_012": 0.000526, "vq_loss_layer_013": 0.00041, "vq_loss_layer_014": 0.000492, "vq_loss_layer_015": 0.000546, "vq_loss_layer_016": 0.000496, "vq_loss_layer_017": 0.000441, "vq_loss_layer_018": 0.000261, "vq_loss_layer_019": 0.000195, "vq_loss_layer_020": 0.000252, "vq_loss_layer_021": 0.000467, "vq_loss_layer_022": 0.000389, "vq_loss_layer_023": 0.000353, "vq_loss_layer_024": 0.000309, "vq_loss_layer_025": 0.000439, "vq_loss_layer_026": 0.000668, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000912, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.004883 }, { "ce_loss": 2.292066, "epoch": 0.00573, "grad_norm": 0.0028179949149489403, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.105469, "key_mse_loss_layer_017": 0.105469, "key_mse_loss_layer_018": 0.109863, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.101562, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.054208, "kv_vq_loss": 0.000548, "learning_rate": 0.0009395386554918475, "loss": 0.054739, "step": 5730, "value_mse_loss_layer_000": 0.000656, "value_mse_loss_layer_001": 0.001945, "value_mse_loss_layer_002": 0.007416, "value_mse_loss_layer_003": 0.011536, "value_mse_loss_layer_004": 0.010864, "value_mse_loss_layer_005": 0.010681, "value_mse_loss_layer_006": 0.013062, "value_mse_loss_layer_007": 0.014221, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.021118, "value_mse_loss_layer_010": 0.018188, "value_mse_loss_layer_011": 0.021118, "value_mse_loss_layer_012": 0.019287, "value_mse_loss_layer_013": 0.021484, "value_mse_loss_layer_014": 0.022461, "value_mse_loss_layer_015": 0.02356, "value_mse_loss_layer_016": 0.019409, "value_mse_loss_layer_017": 0.025146, "value_mse_loss_layer_018": 0.022461, "value_mse_loss_layer_019": 0.027344, "value_mse_loss_layer_020": 0.0271, "value_mse_loss_layer_021": 0.031006, "value_mse_loss_layer_022": 0.030151, "value_mse_loss_layer_023": 0.036133, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.042725, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.047607, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.069824, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.077637, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000151, "vq_loss_layer_007": 0.000226, "vq_loss_layer_008": 0.000254, "vq_loss_layer_009": 0.000275, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000422, "vq_loss_layer_012": 0.000433, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.000519, "vq_loss_layer_015": 0.000448, "vq_loss_layer_016": 0.000402, "vq_loss_layer_017": 0.000448, "vq_loss_layer_018": 0.000229, "vq_loss_layer_019": 0.0002, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.000486, "vq_loss_layer_022": 0.000341, "vq_loss_layer_023": 0.000473, "vq_loss_layer_024": 0.000332, "vq_loss_layer_025": 0.000441, "vq_loss_layer_026": 0.000614, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.000973, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.005188 }, { "ce_loss": 2.328986, "epoch": 0.00574, "grad_norm": 0.003100545145571232, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.054767, "kv_vq_loss": 0.000582, "learning_rate": 0.0009397279730994934, "loss": 0.055334, "step": 5740, "value_mse_loss_layer_000": 0.000648, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007263, "value_mse_loss_layer_003": 0.011902, "value_mse_loss_layer_004": 0.010864, "value_mse_loss_layer_005": 0.010742, "value_mse_loss_layer_006": 0.012878, "value_mse_loss_layer_007": 0.01355, "value_mse_loss_layer_008": 0.016724, "value_mse_loss_layer_009": 0.021362, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.019775, "value_mse_loss_layer_012": 0.020996, "value_mse_loss_layer_013": 0.02124, "value_mse_loss_layer_014": 0.022217, "value_mse_loss_layer_015": 0.025269, "value_mse_loss_layer_016": 0.021118, "value_mse_loss_layer_017": 0.024292, "value_mse_loss_layer_018": 0.02417, "value_mse_loss_layer_019": 0.025391, "value_mse_loss_layer_020": 0.027954, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.039307, "value_mse_loss_layer_025": 0.047363, "value_mse_loss_layer_026": 0.039551, "value_mse_loss_layer_027": 0.053711, "value_mse_loss_layer_028": 0.056152, "value_mse_loss_layer_029": 0.078125, "value_mse_loss_layer_030": 0.071289, "value_mse_loss_layer_031": 0.077637, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 8.7e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.00015, "vq_loss_layer_007": 0.0002, "vq_loss_layer_008": 0.000213, "vq_loss_layer_009": 0.000267, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.000278, "vq_loss_layer_012": 0.000504, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.000441, "vq_loss_layer_015": 0.000448, "vq_loss_layer_016": 0.000446, "vq_loss_layer_017": 0.000355, "vq_loss_layer_018": 0.000236, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000456, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000965, "vq_loss_layer_030": 0.001892, "vq_loss_layer_031": 0.004395 }, { "ce_loss": 2.267918, "epoch": 0.00575, "grad_norm": 0.0027479773852974176, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.054468, "kv_vq_loss": 0.000569, "learning_rate": 0.0009399169611724075, "loss": 0.055014, "step": 5750, "value_mse_loss_layer_000": 0.000656, "value_mse_loss_layer_001": 0.001953, "value_mse_loss_layer_002": 0.007477, "value_mse_loss_layer_003": 0.011719, "value_mse_loss_layer_004": 0.012451, "value_mse_loss_layer_005": 0.010986, "value_mse_loss_layer_006": 0.013184, "value_mse_loss_layer_007": 0.014709, "value_mse_loss_layer_008": 0.017334, "value_mse_loss_layer_009": 0.022583, "value_mse_loss_layer_010": 0.019165, "value_mse_loss_layer_011": 0.019531, "value_mse_loss_layer_012": 0.020508, "value_mse_loss_layer_013": 0.023071, "value_mse_loss_layer_014": 0.023315, "value_mse_loss_layer_015": 0.025635, "value_mse_loss_layer_016": 0.021118, "value_mse_loss_layer_017": 0.025024, "value_mse_loss_layer_018": 0.022583, "value_mse_loss_layer_019": 0.025269, "value_mse_loss_layer_020": 0.027222, "value_mse_loss_layer_021": 0.033447, "value_mse_loss_layer_022": 0.034912, "value_mse_loss_layer_023": 0.0354, "value_mse_loss_layer_024": 0.037842, "value_mse_loss_layer_025": 0.047363, "value_mse_loss_layer_026": 0.042236, "value_mse_loss_layer_027": 0.056641, "value_mse_loss_layer_028": 0.058105, "value_mse_loss_layer_029": 0.087402, "value_mse_loss_layer_030": 0.074707, "value_mse_loss_layer_031": 0.080566, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 0.000101, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.000147, "vq_loss_layer_007": 0.000248, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000326, "vq_loss_layer_010": 0.000311, "vq_loss_layer_011": 0.00029, "vq_loss_layer_012": 0.000452, "vq_loss_layer_013": 0.000446, "vq_loss_layer_014": 0.000515, "vq_loss_layer_015": 0.00058, "vq_loss_layer_016": 0.000492, "vq_loss_layer_017": 0.000414, "vq_loss_layer_018": 0.000256, "vq_loss_layer_019": 0.000207, "vq_loss_layer_020": 0.000292, "vq_loss_layer_021": 0.000519, "vq_loss_layer_022": 0.000404, "vq_loss_layer_023": 0.00033, "vq_loss_layer_024": 0.000315, "vq_loss_layer_025": 0.000412, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000694, "vq_loss_layer_028": 0.00132, "vq_loss_layer_029": 0.001572, "vq_loss_layer_030": 0.002563, "vq_loss_layer_031": 0.005646 }, { "ce_loss": 2.292987, "epoch": 0.00576, "grad_norm": 0.0030613557901233435, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.054108, "kv_vq_loss": 0.000559, "learning_rate": 0.000940105620855803, "loss": 0.054642, "step": 5760, "value_mse_loss_layer_000": 0.000645, "value_mse_loss_layer_001": 0.001938, "value_mse_loss_layer_002": 0.007599, "value_mse_loss_layer_003": 0.013672, "value_mse_loss_layer_004": 0.011597, "value_mse_loss_layer_005": 0.012512, "value_mse_loss_layer_006": 0.012878, "value_mse_loss_layer_007": 0.013916, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.021362, "value_mse_loss_layer_010": 0.018066, "value_mse_loss_layer_011": 0.019165, "value_mse_loss_layer_012": 0.023193, "value_mse_loss_layer_013": 0.02124, "value_mse_loss_layer_014": 0.022583, "value_mse_loss_layer_015": 0.02356, "value_mse_loss_layer_016": 0.021484, "value_mse_loss_layer_017": 0.02478, "value_mse_loss_layer_018": 0.021484, "value_mse_loss_layer_019": 0.025513, "value_mse_loss_layer_020": 0.026733, "value_mse_loss_layer_021": 0.029785, "value_mse_loss_layer_022": 0.030029, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.038086, "value_mse_loss_layer_025": 0.050537, "value_mse_loss_layer_026": 0.04126, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.056885, "value_mse_loss_layer_029": 0.073242, "value_mse_loss_layer_030": 0.074707, "value_mse_loss_layer_031": 0.078613, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 0.000139, "vq_loss_layer_006": 0.000145, "vq_loss_layer_007": 0.000195, "vq_loss_layer_008": 0.000259, "vq_loss_layer_009": 0.000265, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.000277, "vq_loss_layer_012": 0.000664, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000475, "vq_loss_layer_015": 0.000427, "vq_loss_layer_016": 0.000481, "vq_loss_layer_017": 0.000576, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000207, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000366, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000519, "vq_loss_layer_026": 0.00061, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.001083, "vq_loss_layer_029": 0.001442, "vq_loss_layer_030": 0.002762, "vq_loss_layer_031": 0.005798 }, { "ce_loss": 2.290671, "epoch": 0.00577, "grad_norm": 0.0031153536401689053, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.053711, "key_mse_loss_layer_004": 0.060303, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.054636, "kv_vq_loss": 0.000563, "learning_rate": 0.0009402939532889327, "loss": 0.055191, "step": 5770, "value_mse_loss_layer_000": 0.000641, "value_mse_loss_layer_001": 0.001945, "value_mse_loss_layer_002": 0.007629, "value_mse_loss_layer_003": 0.011719, "value_mse_loss_layer_004": 0.010376, "value_mse_loss_layer_005": 0.010803, "value_mse_loss_layer_006": 0.012756, "value_mse_loss_layer_007": 0.014282, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.021606, "value_mse_loss_layer_010": 0.018799, "value_mse_loss_layer_011": 0.019531, "value_mse_loss_layer_012": 0.020142, "value_mse_loss_layer_013": 0.020996, "value_mse_loss_layer_014": 0.023315, "value_mse_loss_layer_015": 0.025269, "value_mse_loss_layer_016": 0.020996, "value_mse_loss_layer_017": 0.025513, "value_mse_loss_layer_018": 0.022949, "value_mse_loss_layer_019": 0.026367, "value_mse_loss_layer_020": 0.028442, "value_mse_loss_layer_021": 0.037598, "value_mse_loss_layer_022": 0.033447, "value_mse_loss_layer_023": 0.037354, "value_mse_loss_layer_024": 0.039795, "value_mse_loss_layer_025": 0.050781, "value_mse_loss_layer_026": 0.043213, "value_mse_loss_layer_027": 0.054199, "value_mse_loss_layer_028": 0.058838, "value_mse_loss_layer_029": 0.078125, "value_mse_loss_layer_030": 0.074707, "value_mse_loss_layer_031": 0.083008, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 9.2e-05, "vq_loss_layer_006": 0.000143, "vq_loss_layer_007": 0.00024, "vq_loss_layer_008": 0.000213, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000229, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.000446, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000477, "vq_loss_layer_015": 0.000486, "vq_loss_layer_016": 0.00042, "vq_loss_layer_017": 0.000389, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.00019, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000427, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.004486 }, { "ce_loss": 2.317381, "epoch": 0.00578, "grad_norm": 0.003753128694370389, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.053842, "kv_vq_loss": 0.000536, "learning_rate": 0.0009404819596051321, "loss": 0.054349, "step": 5780, "value_mse_loss_layer_000": 0.000648, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007568, "value_mse_loss_layer_003": 0.012085, "value_mse_loss_layer_004": 0.011475, "value_mse_loss_layer_005": 0.010925, "value_mse_loss_layer_006": 0.013367, "value_mse_loss_layer_007": 0.014038, "value_mse_loss_layer_008": 0.016724, "value_mse_loss_layer_009": 0.021729, "value_mse_loss_layer_010": 0.018188, "value_mse_loss_layer_011": 0.019165, "value_mse_loss_layer_012": 0.019775, "value_mse_loss_layer_013": 0.023193, "value_mse_loss_layer_014": 0.022217, "value_mse_loss_layer_015": 0.026245, "value_mse_loss_layer_016": 0.023315, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.023071, "value_mse_loss_layer_019": 0.029297, "value_mse_loss_layer_020": 0.028809, "value_mse_loss_layer_021": 0.033203, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.034668, "value_mse_loss_layer_024": 0.038574, "value_mse_loss_layer_025": 0.050049, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.060791, "value_mse_loss_layer_028": 0.056152, "value_mse_loss_layer_029": 0.083008, "value_mse_loss_layer_030": 0.072266, "value_mse_loss_layer_031": 0.07959, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000161, "vq_loss_layer_007": 0.000199, "vq_loss_layer_008": 0.000207, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000219, "vq_loss_layer_011": 0.000252, "vq_loss_layer_012": 0.000408, "vq_loss_layer_013": 0.000385, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000484, "vq_loss_layer_016": 0.000484, "vq_loss_layer_017": 0.000414, "vq_loss_layer_018": 0.000254, "vq_loss_layer_019": 0.000238, "vq_loss_layer_020": 0.00024, "vq_loss_layer_021": 0.00042, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000298, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000385, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.000771, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.001152, "vq_loss_layer_030": 0.002289, "vq_loss_layer_031": 0.0047 }, { "ce_loss": 2.289423, "epoch": 0.00579, "grad_norm": 0.002668907633051276, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.054547, "kv_vq_loss": 0.000547, "learning_rate": 0.000940669640931859, "loss": 0.055078, "step": 5790, "value_mse_loss_layer_000": 0.000652, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007385, "value_mse_loss_layer_003": 0.012207, "value_mse_loss_layer_004": 0.010559, "value_mse_loss_layer_005": 0.01062, "value_mse_loss_layer_006": 0.012573, "value_mse_loss_layer_007": 0.013367, "value_mse_loss_layer_008": 0.016479, "value_mse_loss_layer_009": 0.02124, "value_mse_loss_layer_010": 0.0177, "value_mse_loss_layer_011": 0.018921, "value_mse_loss_layer_012": 0.02063, "value_mse_loss_layer_013": 0.021362, "value_mse_loss_layer_014": 0.023193, "value_mse_loss_layer_015": 0.024536, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.024292, "value_mse_loss_layer_018": 0.021606, "value_mse_loss_layer_019": 0.025024, "value_mse_loss_layer_020": 0.026855, "value_mse_loss_layer_021": 0.031982, "value_mse_loss_layer_022": 0.030518, "value_mse_loss_layer_023": 0.035889, "value_mse_loss_layer_024": 0.037109, "value_mse_loss_layer_025": 0.044189, "value_mse_loss_layer_026": 0.039307, "value_mse_loss_layer_027": 0.052002, "value_mse_loss_layer_028": 0.055664, "value_mse_loss_layer_029": 0.075684, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.075684, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000135, "vq_loss_layer_007": 0.000189, "vq_loss_layer_008": 0.000209, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000257, "vq_loss_layer_012": 0.000488, "vq_loss_layer_013": 0.000328, "vq_loss_layer_014": 0.000515, "vq_loss_layer_015": 0.000471, "vq_loss_layer_016": 0.00042, "vq_loss_layer_017": 0.00037, "vq_loss_layer_018": 0.000223, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000221, "vq_loss_layer_021": 0.000456, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.00032, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000896, "vq_loss_layer_029": 0.000957, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.004303 }, { "ce_loss": 2.269799, "epoch": 0.0058, "grad_norm": 0.0030556118581444025, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.106934, "key_mse_loss_layer_014": 0.104004, "key_mse_loss_layer_015": 0.092773, "key_mse_loss_layer_016": 0.084961, "key_mse_loss_layer_017": 0.089844, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.089355, "key_mse_loss_layer_021": 0.085449, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.054526, "kv_vq_loss": 0.000575, "learning_rate": 0.0009408569983907341, "loss": 0.055084, "step": 5800, "value_mse_loss_layer_000": 0.000645, "value_mse_loss_layer_001": 0.001953, "value_mse_loss_layer_002": 0.008057, "value_mse_loss_layer_003": 0.012451, "value_mse_loss_layer_004": 0.010681, "value_mse_loss_layer_005": 0.010376, "value_mse_loss_layer_006": 0.012329, "value_mse_loss_layer_007": 0.013245, "value_mse_loss_layer_008": 0.016846, "value_mse_loss_layer_009": 0.021973, "value_mse_loss_layer_010": 0.017334, "value_mse_loss_layer_011": 0.018677, "value_mse_loss_layer_012": 0.020264, "value_mse_loss_layer_013": 0.020142, "value_mse_loss_layer_014": 0.021851, "value_mse_loss_layer_015": 0.024414, "value_mse_loss_layer_016": 0.020264, "value_mse_loss_layer_017": 0.024292, "value_mse_loss_layer_018": 0.023193, "value_mse_loss_layer_019": 0.025391, "value_mse_loss_layer_020": 0.030029, "value_mse_loss_layer_021": 0.031006, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.036621, "value_mse_loss_layer_024": 0.039551, "value_mse_loss_layer_025": 0.047119, "value_mse_loss_layer_026": 0.041748, "value_mse_loss_layer_027": 0.053223, "value_mse_loss_layer_028": 0.057129, "value_mse_loss_layer_029": 0.080078, "value_mse_loss_layer_030": 0.082031, "value_mse_loss_layer_031": 0.076172, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000194, "vq_loss_layer_008": 0.000215, "vq_loss_layer_009": 0.000315, "vq_loss_layer_010": 0.000201, "vq_loss_layer_011": 0.000252, "vq_loss_layer_012": 0.000463, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000414, "vq_loss_layer_015": 0.000441, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000446, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000456, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002716, "vq_loss_layer_031": 0.004395 }, { "ce_loss": 2.314544, "epoch": 0.00581, "grad_norm": 0.00423622690141201, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.054416, "kv_vq_loss": 0.000545, "learning_rate": 0.0009410440330975825, "loss": 0.054932, "step": 5810, "value_mse_loss_layer_000": 0.000652, "value_mse_loss_layer_001": 0.001953, "value_mse_loss_layer_002": 0.007629, "value_mse_loss_layer_003": 0.011475, "value_mse_loss_layer_004": 0.01123, "value_mse_loss_layer_005": 0.010437, "value_mse_loss_layer_006": 0.012329, "value_mse_loss_layer_007": 0.013428, "value_mse_loss_layer_008": 0.016846, "value_mse_loss_layer_009": 0.021484, "value_mse_loss_layer_010": 0.020752, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.019531, "value_mse_loss_layer_013": 0.023315, "value_mse_loss_layer_014": 0.021851, "value_mse_loss_layer_015": 0.025146, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.025513, "value_mse_loss_layer_018": 0.02356, "value_mse_loss_layer_019": 0.027466, "value_mse_loss_layer_020": 0.0271, "value_mse_loss_layer_021": 0.042236, "value_mse_loss_layer_022": 0.031128, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.038086, "value_mse_loss_layer_025": 0.048096, "value_mse_loss_layer_026": 0.039307, "value_mse_loss_layer_027": 0.049561, "value_mse_loss_layer_028": 0.055664, "value_mse_loss_layer_029": 0.086426, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.07373, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000189, "vq_loss_layer_008": 0.000203, "vq_loss_layer_009": 0.00024, "vq_loss_layer_010": 0.00025, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.000418, "vq_loss_layer_013": 0.000414, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000483, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000435, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.000214, "vq_loss_layer_021": 0.00058, "vq_loss_layer_022": 0.000271, "vq_loss_layer_023": 0.000322, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000467, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.00412 }, { "ce_loss": 2.290918, "epoch": 0.00582, "grad_norm": 0.0031146930996328592, "key_mse_loss_layer_000": 0.003479, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.100098, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.079102, "key_mse_loss_layer_026": 0.089355, "key_mse_loss_layer_027": 0.092285, "key_mse_loss_layer_028": 0.099121, "key_mse_loss_layer_029": 0.09375, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.079102, "kv_mse_loss": 0.054236, "kv_vq_loss": 0.000552, "learning_rate": 0.000941230746162472, "loss": 0.054767, "step": 5820, "value_mse_loss_layer_000": 0.000629, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007446, "value_mse_loss_layer_003": 0.013672, "value_mse_loss_layer_004": 0.011536, "value_mse_loss_layer_005": 0.011169, "value_mse_loss_layer_006": 0.012939, "value_mse_loss_layer_007": 0.01416, "value_mse_loss_layer_008": 0.016479, "value_mse_loss_layer_009": 0.020874, "value_mse_loss_layer_010": 0.018433, "value_mse_loss_layer_011": 0.019043, "value_mse_loss_layer_012": 0.019775, "value_mse_loss_layer_013": 0.021118, "value_mse_loss_layer_014": 0.022827, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.020752, "value_mse_loss_layer_017": 0.02356, "value_mse_loss_layer_018": 0.022461, "value_mse_loss_layer_019": 0.026001, "value_mse_loss_layer_020": 0.028198, "value_mse_loss_layer_021": 0.03418, "value_mse_loss_layer_022": 0.032715, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.041504, "value_mse_loss_layer_025": 0.051025, "value_mse_loss_layer_026": 0.042725, "value_mse_loss_layer_027": 0.066895, "value_mse_loss_layer_028": 0.063477, "value_mse_loss_layer_029": 0.081055, "value_mse_loss_layer_030": 0.082031, "value_mse_loss_layer_031": 0.08252, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 7.7e-05, "vq_loss_layer_005": 9.1e-05, "vq_loss_layer_006": 0.000153, "vq_loss_layer_007": 0.00021, "vq_loss_layer_008": 0.000232, "vq_loss_layer_009": 0.000244, "vq_loss_layer_010": 0.000254, "vq_loss_layer_011": 0.000259, "vq_loss_layer_012": 0.000404, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000454, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000431, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.000243, "vq_loss_layer_019": 0.000235, "vq_loss_layer_020": 0.000265, "vq_loss_layer_021": 0.000473, "vq_loss_layer_022": 0.000349, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000383, "vq_loss_layer_025": 0.000475, "vq_loss_layer_026": 0.000568, "vq_loss_layer_027": 0.000923, "vq_loss_layer_028": 0.00135, "vq_loss_layer_029": 0.001419, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.005371 }, { "ce_loss": 2.357266, "epoch": 0.00583, "grad_norm": 0.0026426175609230995, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.053513, "kv_vq_loss": 0.00052, "learning_rate": 0.0009414171386897534, "loss": 0.054028, "step": 5830, "value_mse_loss_layer_000": 0.000645, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007355, "value_mse_loss_layer_003": 0.011658, "value_mse_loss_layer_004": 0.010681, "value_mse_loss_layer_005": 0.010437, "value_mse_loss_layer_006": 0.012817, "value_mse_loss_layer_007": 0.01355, "value_mse_loss_layer_008": 0.016357, "value_mse_loss_layer_009": 0.02124, "value_mse_loss_layer_010": 0.018066, "value_mse_loss_layer_011": 0.019165, "value_mse_loss_layer_012": 0.019165, "value_mse_loss_layer_013": 0.020874, "value_mse_loss_layer_014": 0.022339, "value_mse_loss_layer_015": 0.024658, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.024292, "value_mse_loss_layer_018": 0.021851, "value_mse_loss_layer_019": 0.026001, "value_mse_loss_layer_020": 0.026978, "value_mse_loss_layer_021": 0.032959, "value_mse_loss_layer_022": 0.032471, "value_mse_loss_layer_023": 0.0354, "value_mse_loss_layer_024": 0.039062, "value_mse_loss_layer_025": 0.051025, "value_mse_loss_layer_026": 0.043457, "value_mse_loss_layer_027": 0.055664, "value_mse_loss_layer_028": 0.057861, "value_mse_loss_layer_029": 0.07666, "value_mse_loss_layer_030": 0.075195, "value_mse_loss_layer_031": 0.075195, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000207, "vq_loss_layer_008": 0.000197, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000218, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.000389, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000404, "vq_loss_layer_017": 0.000355, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000372, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000801, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.303549, "epoch": 0.00584, "grad_norm": 0.004086795728653669, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.053964, "kv_vq_loss": 0.000535, "learning_rate": 0.0009416032117780997, "loss": 0.054477, "step": 5840, "value_mse_loss_layer_000": 0.000652, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007996, "value_mse_loss_layer_003": 0.011658, "value_mse_loss_layer_004": 0.010742, "value_mse_loss_layer_005": 0.01062, "value_mse_loss_layer_006": 0.012695, "value_mse_loss_layer_007": 0.013916, "value_mse_loss_layer_008": 0.017212, "value_mse_loss_layer_009": 0.023071, "value_mse_loss_layer_010": 0.019165, "value_mse_loss_layer_011": 0.019897, "value_mse_loss_layer_012": 0.022339, "value_mse_loss_layer_013": 0.022461, "value_mse_loss_layer_014": 0.023315, "value_mse_loss_layer_015": 0.025635, "value_mse_loss_layer_016": 0.022827, "value_mse_loss_layer_017": 0.025391, "value_mse_loss_layer_018": 0.022461, "value_mse_loss_layer_019": 0.026978, "value_mse_loss_layer_020": 0.027588, "value_mse_loss_layer_021": 0.030762, "value_mse_loss_layer_022": 0.031006, "value_mse_loss_layer_023": 0.046875, "value_mse_loss_layer_024": 0.036133, "value_mse_loss_layer_025": 0.043213, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.049316, "value_mse_loss_layer_028": 0.056396, "value_mse_loss_layer_029": 0.088867, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.07666, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000197, "vq_loss_layer_008": 0.000233, "vq_loss_layer_009": 0.000332, "vq_loss_layer_010": 0.000243, "vq_loss_layer_011": 0.000277, "vq_loss_layer_012": 0.000515, "vq_loss_layer_013": 0.000383, "vq_loss_layer_014": 0.000463, "vq_loss_layer_015": 0.000483, "vq_loss_layer_016": 0.000481, "vq_loss_layer_017": 0.000422, "vq_loss_layer_018": 0.00024, "vq_loss_layer_019": 0.000181, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.000313, "vq_loss_layer_023": 0.000599, "vq_loss_layer_024": 0.000315, "vq_loss_layer_025": 0.000362, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000866, "vq_loss_layer_029": 0.001328, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.004578 }, { "ce_loss": 2.288967, "epoch": 0.00585, "grad_norm": 0.003006639424711466, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.054117, "kv_vq_loss": 0.000546, "learning_rate": 0.0009417889665205451, "loss": 0.054651, "step": 5850, "value_mse_loss_layer_000": 0.000648, "value_mse_loss_layer_001": 0.001945, "value_mse_loss_layer_002": 0.007172, "value_mse_loss_layer_003": 0.012573, "value_mse_loss_layer_004": 0.010803, "value_mse_loss_layer_005": 0.010498, "value_mse_loss_layer_006": 0.012512, "value_mse_loss_layer_007": 0.014099, "value_mse_loss_layer_008": 0.016479, "value_mse_loss_layer_009": 0.02124, "value_mse_loss_layer_010": 0.017456, "value_mse_loss_layer_011": 0.018921, "value_mse_loss_layer_012": 0.019653, "value_mse_loss_layer_013": 0.020142, "value_mse_loss_layer_014": 0.022827, "value_mse_loss_layer_015": 0.023926, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.024048, "value_mse_loss_layer_018": 0.02356, "value_mse_loss_layer_019": 0.026367, "value_mse_loss_layer_020": 0.02832, "value_mse_loss_layer_021": 0.038574, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.040283, "value_mse_loss_layer_024": 0.046143, "value_mse_loss_layer_025": 0.045654, "value_mse_loss_layer_026": 0.041748, "value_mse_loss_layer_027": 0.052734, "value_mse_loss_layer_028": 0.057129, "value_mse_loss_layer_029": 0.078613, "value_mse_loss_layer_030": 0.076172, "value_mse_loss_layer_031": 0.078125, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 7e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000129, "vq_loss_layer_007": 0.00022, "vq_loss_layer_008": 0.000205, "vq_loss_layer_009": 0.000261, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.000261, "vq_loss_layer_012": 0.000401, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000465, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000433, "vq_loss_layer_017": 0.000353, "vq_loss_layer_018": 0.000311, "vq_loss_layer_019": 0.000203, "vq_loss_layer_020": 0.000233, "vq_loss_layer_021": 0.000481, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.00033, "vq_loss_layer_024": 0.000341, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.000843, "vq_loss_layer_029": 0.001259, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.004822 }, { "ce_loss": 2.322644, "epoch": 0.00586, "grad_norm": 0.0028770172502845526, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.053448, "kv_vq_loss": 0.00052, "learning_rate": 0.0009419744040045226, "loss": 0.053961, "step": 5860, "value_mse_loss_layer_000": 0.00066, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007416, "value_mse_loss_layer_003": 0.01178, "value_mse_loss_layer_004": 0.010681, "value_mse_loss_layer_005": 0.010498, "value_mse_loss_layer_006": 0.012817, "value_mse_loss_layer_007": 0.013672, "value_mse_loss_layer_008": 0.016724, "value_mse_loss_layer_009": 0.02124, "value_mse_loss_layer_010": 0.0177, "value_mse_loss_layer_011": 0.018677, "value_mse_loss_layer_012": 0.019653, "value_mse_loss_layer_013": 0.022095, "value_mse_loss_layer_014": 0.021973, "value_mse_loss_layer_015": 0.02478, "value_mse_loss_layer_016": 0.022583, "value_mse_loss_layer_017": 0.024902, "value_mse_loss_layer_018": 0.023315, "value_mse_loss_layer_019": 0.025513, "value_mse_loss_layer_020": 0.026855, "value_mse_loss_layer_021": 0.032227, "value_mse_loss_layer_022": 0.031006, "value_mse_loss_layer_023": 0.036621, "value_mse_loss_layer_024": 0.039795, "value_mse_loss_layer_025": 0.044922, "value_mse_loss_layer_026": 0.043457, "value_mse_loss_layer_027": 0.052734, "value_mse_loss_layer_028": 0.055176, "value_mse_loss_layer_029": 0.07373, "value_mse_loss_layer_030": 0.074219, "value_mse_loss_layer_031": 0.080566, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000203, "vq_loss_layer_008": 0.000201, "vq_loss_layer_009": 0.000241, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000389, "vq_loss_layer_013": 0.000404, "vq_loss_layer_014": 0.000401, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000465, "vq_loss_layer_017": 0.00036, "vq_loss_layer_018": 0.000252, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000422, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.00033, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000587, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000782, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.005035 }, { "ce_loss": 2.284743, "epoch": 0.00587, "grad_norm": 0.0031799329444766045, "key_mse_loss_layer_000": 0.00705, "key_mse_loss_layer_001": 0.013306, "key_mse_loss_layer_002": 0.062012, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.09082, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.089355, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.081055, "kv_mse_loss": 0.053854, "kv_vq_loss": 0.000546, "learning_rate": 0.0009421595253119034, "loss": 0.054376, "step": 5870, "value_mse_loss_layer_000": 0.000736, "value_mse_loss_layer_001": 0.002045, "value_mse_loss_layer_002": 0.007599, "value_mse_loss_layer_003": 0.012634, "value_mse_loss_layer_004": 0.012329, "value_mse_loss_layer_005": 0.011108, "value_mse_loss_layer_006": 0.012878, "value_mse_loss_layer_007": 0.013916, "value_mse_loss_layer_008": 0.017212, "value_mse_loss_layer_009": 0.021484, "value_mse_loss_layer_010": 0.017822, "value_mse_loss_layer_011": 0.018799, "value_mse_loss_layer_012": 0.02063, "value_mse_loss_layer_013": 0.02002, "value_mse_loss_layer_014": 0.021973, "value_mse_loss_layer_015": 0.024536, "value_mse_loss_layer_016": 0.020996, "value_mse_loss_layer_017": 0.022949, "value_mse_loss_layer_018": 0.023804, "value_mse_loss_layer_019": 0.027344, "value_mse_loss_layer_020": 0.029053, "value_mse_loss_layer_021": 0.031982, "value_mse_loss_layer_022": 0.032471, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.043945, "value_mse_loss_layer_025": 0.049561, "value_mse_loss_layer_026": 0.047607, "value_mse_loss_layer_027": 0.05957, "value_mse_loss_layer_028": 0.057617, "value_mse_loss_layer_029": 0.089355, "value_mse_loss_layer_030": 0.083008, "value_mse_loss_layer_031": 0.091797, "vq_loss_layer_000": 1e-05, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 0.000105, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000164, "vq_loss_layer_007": 0.000198, "vq_loss_layer_008": 0.000296, "vq_loss_layer_009": 0.000345, "vq_loss_layer_010": 0.000294, "vq_loss_layer_011": 0.00032, "vq_loss_layer_012": 0.000515, "vq_loss_layer_013": 0.000387, "vq_loss_layer_014": 0.0005, "vq_loss_layer_015": 0.000603, "vq_loss_layer_016": 0.000534, "vq_loss_layer_017": 0.00041, "vq_loss_layer_018": 0.000315, "vq_loss_layer_019": 0.000244, "vq_loss_layer_020": 0.000271, "vq_loss_layer_021": 0.000484, "vq_loss_layer_022": 0.000326, "vq_loss_layer_023": 0.000307, "vq_loss_layer_024": 0.000422, "vq_loss_layer_025": 0.000496, "vq_loss_layer_026": 0.000813, "vq_loss_layer_027": 0.000847, "vq_loss_layer_028": 0.001114, "vq_loss_layer_029": 0.001747, "vq_loss_layer_030": 0.003433, "vq_loss_layer_031": 0.007568 }, { "ce_loss": 2.31704, "epoch": 0.00588, "grad_norm": 0.0036769257858395576, "key_mse_loss_layer_000": 0.002686, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.043945, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.09375, "key_mse_loss_layer_009": 0.099121, "key_mse_loss_layer_010": 0.114258, "key_mse_loss_layer_011": 0.109863, "key_mse_loss_layer_012": 0.084961, "key_mse_loss_layer_013": 0.150391, "key_mse_loss_layer_014": 0.144531, "key_mse_loss_layer_015": 0.129883, "key_mse_loss_layer_016": 0.124023, "key_mse_loss_layer_017": 0.125, "key_mse_loss_layer_018": 0.126953, "key_mse_loss_layer_019": 0.103027, "key_mse_loss_layer_020": 0.121094, "key_mse_loss_layer_021": 0.11377, "key_mse_loss_layer_022": 0.116211, "key_mse_loss_layer_023": 0.116211, "key_mse_loss_layer_024": 0.088379, "key_mse_loss_layer_025": 0.082031, "key_mse_loss_layer_026": 0.098145, "key_mse_loss_layer_027": 0.092285, "key_mse_loss_layer_028": 0.102051, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.09668, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.05416, "kv_vq_loss": 0.000542, "learning_rate": 0.0009423443315190345, "loss": 0.054681, "step": 5880, "value_mse_loss_layer_000": 0.000622, "value_mse_loss_layer_001": 0.001907, "value_mse_loss_layer_002": 0.007874, "value_mse_loss_layer_003": 0.011475, "value_mse_loss_layer_004": 0.011597, "value_mse_loss_layer_005": 0.01123, "value_mse_loss_layer_006": 0.013367, "value_mse_loss_layer_007": 0.014648, "value_mse_loss_layer_008": 0.016235, "value_mse_loss_layer_009": 0.021973, "value_mse_loss_layer_010": 0.019775, "value_mse_loss_layer_011": 0.019775, "value_mse_loss_layer_012": 0.02124, "value_mse_loss_layer_013": 0.022583, "value_mse_loss_layer_014": 0.022461, "value_mse_loss_layer_015": 0.02356, "value_mse_loss_layer_016": 0.021851, "value_mse_loss_layer_017": 0.024902, "value_mse_loss_layer_018": 0.020874, "value_mse_loss_layer_019": 0.023804, "value_mse_loss_layer_020": 0.026733, "value_mse_loss_layer_021": 0.038818, "value_mse_loss_layer_022": 0.028442, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.035645, "value_mse_loss_layer_025": 0.048096, "value_mse_loss_layer_026": 0.038818, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.07373, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2.1e-05, "vq_loss_layer_002": 2.9e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 0.000105, "vq_loss_layer_005": 0.000142, "vq_loss_layer_006": 0.000198, "vq_loss_layer_007": 0.000265, "vq_loss_layer_008": 0.000286, "vq_loss_layer_009": 0.000319, "vq_loss_layer_010": 0.000332, "vq_loss_layer_011": 0.000343, "vq_loss_layer_012": 0.000557, "vq_loss_layer_013": 0.000435, "vq_loss_layer_014": 0.000549, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.000519, "vq_loss_layer_017": 0.000483, "vq_loss_layer_018": 0.000389, "vq_loss_layer_019": 0.000273, "vq_loss_layer_020": 0.000345, "vq_loss_layer_021": 0.000862, "vq_loss_layer_022": 0.000473, "vq_loss_layer_023": 0.00079, "vq_loss_layer_024": 0.00058, "vq_loss_layer_025": 0.000931, "vq_loss_layer_026": 0.001099, "vq_loss_layer_027": 0.000755, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.00293, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.005768 }, { "ce_loss": 2.316871, "epoch": 0.00589, "grad_norm": 0.0027889178600162268, "key_mse_loss_layer_000": 0.003937, "key_mse_loss_layer_001": 0.010986, "key_mse_loss_layer_002": 0.059814, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.053699, "kv_vq_loss": 0.000529, "learning_rate": 0.0009425288236967753, "loss": 0.054221, "step": 5890, "value_mse_loss_layer_000": 0.000698, "value_mse_loss_layer_001": 0.002029, "value_mse_loss_layer_002": 0.007874, "value_mse_loss_layer_003": 0.013672, "value_mse_loss_layer_004": 0.011475, "value_mse_loss_layer_005": 0.011169, "value_mse_loss_layer_006": 0.012817, "value_mse_loss_layer_007": 0.013916, "value_mse_loss_layer_008": 0.016968, "value_mse_loss_layer_009": 0.021484, "value_mse_loss_layer_010": 0.019409, "value_mse_loss_layer_011": 0.018799, "value_mse_loss_layer_012": 0.020142, "value_mse_loss_layer_013": 0.021606, "value_mse_loss_layer_014": 0.022461, "value_mse_loss_layer_015": 0.02478, "value_mse_loss_layer_016": 0.020996, "value_mse_loss_layer_017": 0.025024, "value_mse_loss_layer_018": 0.021851, "value_mse_loss_layer_019": 0.028076, "value_mse_loss_layer_020": 0.026611, "value_mse_loss_layer_021": 0.032715, "value_mse_loss_layer_022": 0.030884, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.040283, "value_mse_loss_layer_025": 0.044922, "value_mse_loss_layer_026": 0.041504, "value_mse_loss_layer_027": 0.055908, "value_mse_loss_layer_028": 0.058105, "value_mse_loss_layer_029": 0.081543, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.07959, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 0.000101, "vq_loss_layer_006": 0.000149, "vq_loss_layer_007": 0.000206, "vq_loss_layer_008": 0.000263, "vq_loss_layer_009": 0.000288, "vq_loss_layer_010": 0.00033, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.00045, "vq_loss_layer_013": 0.000412, "vq_loss_layer_014": 0.0005, "vq_loss_layer_015": 0.000519, "vq_loss_layer_016": 0.000526, "vq_loss_layer_017": 0.000477, "vq_loss_layer_018": 0.000238, "vq_loss_layer_019": 0.000259, "vq_loss_layer_020": 0.000241, "vq_loss_layer_021": 0.000549, "vq_loss_layer_022": 0.000326, "vq_loss_layer_023": 0.000324, "vq_loss_layer_024": 0.000439, "vq_loss_layer_025": 0.000469, "vq_loss_layer_026": 0.000645, "vq_loss_layer_027": 0.000839, "vq_loss_layer_028": 0.00135, "vq_loss_layer_029": 0.001846, "vq_loss_layer_030": 0.003815, "vq_loss_layer_031": 0.006378 }, { "ce_loss": 2.304841, "epoch": 0.0059, "grad_norm": 0.0029456692282110453, "key_mse_loss_layer_000": 0.003677, "key_mse_loss_layer_001": 0.011414, "key_mse_loss_layer_002": 0.064453, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.071777, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.100586, "key_mse_loss_layer_024": 0.084961, "key_mse_loss_layer_025": 0.080566, "key_mse_loss_layer_026": 0.095703, "key_mse_loss_layer_027": 0.102539, "key_mse_loss_layer_028": 0.104492, "key_mse_loss_layer_029": 0.099121, "key_mse_loss_layer_030": 0.10498, "key_mse_loss_layer_031": 0.084961, "kv_mse_loss": 0.053998, "kv_vq_loss": 0.000536, "learning_rate": 0.0009427130029105358, "loss": 0.054523, "step": 5900, "value_mse_loss_layer_000": 0.000622, "value_mse_loss_layer_001": 0.001945, "value_mse_loss_layer_002": 0.007721, "value_mse_loss_layer_003": 0.012939, "value_mse_loss_layer_004": 0.011414, "value_mse_loss_layer_005": 0.01178, "value_mse_loss_layer_006": 0.012085, "value_mse_loss_layer_007": 0.013611, "value_mse_loss_layer_008": 0.016479, "value_mse_loss_layer_009": 0.019897, "value_mse_loss_layer_010": 0.017212, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.020996, "value_mse_loss_layer_013": 0.019653, "value_mse_loss_layer_014": 0.021606, "value_mse_loss_layer_015": 0.022217, "value_mse_loss_layer_016": 0.02002, "value_mse_loss_layer_017": 0.022095, "value_mse_loss_layer_018": 0.031738, "value_mse_loss_layer_019": 0.027344, "value_mse_loss_layer_020": 0.02832, "value_mse_loss_layer_021": 0.031006, "value_mse_loss_layer_022": 0.034912, "value_mse_loss_layer_023": 0.035156, "value_mse_loss_layer_024": 0.041748, "value_mse_loss_layer_025": 0.05127, "value_mse_loss_layer_026": 0.047607, "value_mse_loss_layer_027": 0.060547, "value_mse_loss_layer_028": 0.061523, "value_mse_loss_layer_029": 0.083984, "value_mse_loss_layer_030": 0.085938, "value_mse_loss_layer_031": 0.090332, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.6e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 0.000116, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000193, "vq_loss_layer_008": 0.000294, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000282, "vq_loss_layer_011": 0.000284, "vq_loss_layer_012": 0.000492, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000444, "vq_loss_layer_015": 0.000439, "vq_loss_layer_016": 0.000456, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.00071, "vq_loss_layer_019": 0.000215, "vq_loss_layer_020": 0.000232, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000343, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000374, "vq_loss_layer_025": 0.000584, "vq_loss_layer_026": 0.000797, "vq_loss_layer_027": 0.000889, "vq_loss_layer_028": 0.001305, "vq_loss_layer_029": 0.002045, "vq_loss_layer_030": 0.003387, "vq_loss_layer_031": 0.00769 }, { "ce_loss": 2.309347, "epoch": 0.00591, "grad_norm": 0.004100915510207415, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.054007, "kv_vq_loss": 0.000531, "learning_rate": 0.0009428968702203137, "loss": 0.054529, "step": 5910, "value_mse_loss_layer_000": 0.000648, "value_mse_loss_layer_001": 0.001915, "value_mse_loss_layer_002": 0.007202, "value_mse_loss_layer_003": 0.011658, "value_mse_loss_layer_004": 0.01062, "value_mse_loss_layer_005": 0.010864, "value_mse_loss_layer_006": 0.012634, "value_mse_loss_layer_007": 0.01416, "value_mse_loss_layer_008": 0.016479, "value_mse_loss_layer_009": 0.021606, "value_mse_loss_layer_010": 0.0177, "value_mse_loss_layer_011": 0.019531, "value_mse_loss_layer_012": 0.019409, "value_mse_loss_layer_013": 0.020996, "value_mse_loss_layer_014": 0.022217, "value_mse_loss_layer_015": 0.025635, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.024902, "value_mse_loss_layer_018": 0.024048, "value_mse_loss_layer_019": 0.025269, "value_mse_loss_layer_020": 0.026855, "value_mse_loss_layer_021": 0.043701, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.032959, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.045898, "value_mse_loss_layer_026": 0.039307, "value_mse_loss_layer_027": 0.049805, "value_mse_loss_layer_028": 0.054688, "value_mse_loss_layer_029": 0.084961, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.072266, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000144, "vq_loss_layer_007": 0.000222, "vq_loss_layer_008": 0.000214, "vq_loss_layer_009": 0.000275, "vq_loss_layer_010": 0.00021, "vq_loss_layer_011": 0.000296, "vq_loss_layer_012": 0.000406, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000427, "vq_loss_layer_015": 0.000492, "vq_loss_layer_016": 0.000418, "vq_loss_layer_017": 0.000397, "vq_loss_layer_018": 0.000254, "vq_loss_layer_019": 0.00019, "vq_loss_layer_020": 0.000227, "vq_loss_layer_021": 0.000648, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.001114, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.271338, "epoch": 0.00592, "grad_norm": 0.0030034903902560472, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.054739, "kv_vq_loss": 0.000558, "learning_rate": 0.0009430804266807297, "loss": 0.055289, "step": 5920, "value_mse_loss_layer_000": 0.000641, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.00705, "value_mse_loss_layer_003": 0.011169, "value_mse_loss_layer_004": 0.010559, "value_mse_loss_layer_005": 0.010254, "value_mse_loss_layer_006": 0.012512, "value_mse_loss_layer_007": 0.013306, "value_mse_loss_layer_008": 0.016113, "value_mse_loss_layer_009": 0.020874, "value_mse_loss_layer_010": 0.019653, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.019165, "value_mse_loss_layer_013": 0.021362, "value_mse_loss_layer_014": 0.023193, "value_mse_loss_layer_015": 0.023804, "value_mse_loss_layer_016": 0.027466, "value_mse_loss_layer_017": 0.02417, "value_mse_loss_layer_018": 0.022827, "value_mse_loss_layer_019": 0.026611, "value_mse_loss_layer_020": 0.030518, "value_mse_loss_layer_021": 0.032227, "value_mse_loss_layer_022": 0.029907, "value_mse_loss_layer_023": 0.038818, "value_mse_loss_layer_024": 0.038818, "value_mse_loss_layer_025": 0.045654, "value_mse_loss_layer_026": 0.038574, "value_mse_loss_layer_027": 0.050049, "value_mse_loss_layer_028": 0.053955, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.074707, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000137, "vq_loss_layer_007": 0.000193, "vq_loss_layer_008": 0.000194, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000243, "vq_loss_layer_011": 0.000243, "vq_loss_layer_012": 0.000418, "vq_loss_layer_013": 0.000341, "vq_loss_layer_014": 0.000444, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000607, "vq_loss_layer_017": 0.000345, "vq_loss_layer_018": 0.000221, "vq_loss_layer_019": 0.000203, "vq_loss_layer_020": 0.000278, "vq_loss_layer_021": 0.000446, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000385, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.001663, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.308395, "epoch": 0.00593, "grad_norm": 0.003007922787219286, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.09082, "key_mse_loss_layer_009": 0.096191, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.133789, "key_mse_loss_layer_014": 0.130859, "key_mse_loss_layer_015": 0.118164, "key_mse_loss_layer_016": 0.112305, "key_mse_loss_layer_017": 0.112793, "key_mse_loss_layer_018": 0.118652, "key_mse_loss_layer_019": 0.096191, "key_mse_loss_layer_020": 0.109375, "key_mse_loss_layer_021": 0.104492, "key_mse_loss_layer_022": 0.10791, "key_mse_loss_layer_023": 0.102539, "key_mse_loss_layer_024": 0.081055, "key_mse_loss_layer_025": 0.077148, "key_mse_loss_layer_026": 0.090332, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.09375, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.054358, "kv_vq_loss": 0.00056, "learning_rate": 0.0009432636733410655, "loss": 0.054907, "step": 5930, "value_mse_loss_layer_000": 0.000648, "value_mse_loss_layer_001": 0.001923, "value_mse_loss_layer_002": 0.007446, "value_mse_loss_layer_003": 0.01239, "value_mse_loss_layer_004": 0.010925, "value_mse_loss_layer_005": 0.010925, "value_mse_loss_layer_006": 0.012695, "value_mse_loss_layer_007": 0.014343, "value_mse_loss_layer_008": 0.016846, "value_mse_loss_layer_009": 0.021484, "value_mse_loss_layer_010": 0.018799, "value_mse_loss_layer_011": 0.018799, "value_mse_loss_layer_012": 0.019043, "value_mse_loss_layer_013": 0.021118, "value_mse_loss_layer_014": 0.022583, "value_mse_loss_layer_015": 0.024048, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.023926, "value_mse_loss_layer_018": 0.020874, "value_mse_loss_layer_019": 0.026733, "value_mse_loss_layer_020": 0.026367, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.044922, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.047607, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.066895, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.07666, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000159, "vq_loss_layer_007": 0.000252, "vq_loss_layer_008": 0.000269, "vq_loss_layer_009": 0.000334, "vq_loss_layer_010": 0.000326, "vq_loss_layer_011": 0.000273, "vq_loss_layer_012": 0.000425, "vq_loss_layer_013": 0.00037, "vq_loss_layer_014": 0.000519, "vq_loss_layer_015": 0.000471, "vq_loss_layer_016": 0.000504, "vq_loss_layer_017": 0.000408, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.000284, "vq_loss_layer_020": 0.000298, "vq_loss_layer_021": 0.000549, "vq_loss_layer_022": 0.000429, "vq_loss_layer_023": 0.000395, "vq_loss_layer_024": 0.00037, "vq_loss_layer_025": 0.000519, "vq_loss_layer_026": 0.000603, "vq_loss_layer_027": 0.000717, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.001068, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.005524 }, { "ce_loss": 2.288395, "epoch": 0.00594, "grad_norm": 0.002551356330513954, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.053729, "kv_vq_loss": 0.000535, "learning_rate": 0.0009434466112452982, "loss": 0.054263, "step": 5940, "value_mse_loss_layer_000": 0.000656, "value_mse_loss_layer_001": 0.001938, "value_mse_loss_layer_002": 0.007599, "value_mse_loss_layer_003": 0.011902, "value_mse_loss_layer_004": 0.011047, "value_mse_loss_layer_005": 0.011047, "value_mse_loss_layer_006": 0.013, "value_mse_loss_layer_007": 0.013611, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.021484, "value_mse_loss_layer_010": 0.018066, "value_mse_loss_layer_011": 0.018921, "value_mse_loss_layer_012": 0.020996, "value_mse_loss_layer_013": 0.021484, "value_mse_loss_layer_014": 0.021484, "value_mse_loss_layer_015": 0.024292, "value_mse_loss_layer_016": 0.02124, "value_mse_loss_layer_017": 0.02478, "value_mse_loss_layer_018": 0.021484, "value_mse_loss_layer_019": 0.025024, "value_mse_loss_layer_020": 0.026855, "value_mse_loss_layer_021": 0.031494, "value_mse_loss_layer_022": 0.031128, "value_mse_loss_layer_023": 0.034912, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.044434, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.049072, "value_mse_loss_layer_028": 0.053955, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.076172, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 8.8e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000275, "vq_loss_layer_010": 0.000234, "vq_loss_layer_011": 0.000265, "vq_loss_layer_012": 0.000475, "vq_loss_layer_013": 0.000374, "vq_loss_layer_014": 0.000427, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000458, "vq_loss_layer_017": 0.000383, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000187, "vq_loss_layer_020": 0.000237, "vq_loss_layer_021": 0.000479, "vq_loss_layer_022": 0.000317, "vq_loss_layer_023": 0.000334, "vq_loss_layer_024": 0.000296, "vq_loss_layer_025": 0.000401, "vq_loss_layer_026": 0.000553, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.00095, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.004974 }, { "ce_loss": 2.310346, "epoch": 0.00595, "grad_norm": 0.0037292451597750187, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.054047, "kv_vq_loss": 0.00053, "learning_rate": 0.0009436292414321373, "loss": 0.054568, "step": 5950, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.00193, "value_mse_loss_layer_002": 0.007935, "value_mse_loss_layer_003": 0.011292, "value_mse_loss_layer_004": 0.010986, "value_mse_loss_layer_005": 0.010132, "value_mse_loss_layer_006": 0.012817, "value_mse_loss_layer_007": 0.013489, "value_mse_loss_layer_008": 0.016235, "value_mse_loss_layer_009": 0.021118, "value_mse_loss_layer_010": 0.017578, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.019287, "value_mse_loss_layer_013": 0.020874, "value_mse_loss_layer_014": 0.022095, "value_mse_loss_layer_015": 0.026001, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.02478, "value_mse_loss_layer_018": 0.023193, "value_mse_loss_layer_019": 0.025635, "value_mse_loss_layer_020": 0.033203, "value_mse_loss_layer_021": 0.031982, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.035889, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.054199, "value_mse_loss_layer_026": 0.042236, "value_mse_loss_layer_027": 0.05127, "value_mse_loss_layer_028": 0.055664, "value_mse_loss_layer_029": 0.075684, "value_mse_loss_layer_030": 0.072266, "value_mse_loss_layer_031": 0.080078, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000159, "vq_loss_layer_007": 0.000222, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000239, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.00023, "vq_loss_layer_012": 0.00041, "vq_loss_layer_013": 0.000366, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.000387, "vq_loss_layer_017": 0.000364, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000228, "vq_loss_layer_021": 0.00036, "vq_loss_layer_022": 0.000238, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.0002, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000679, "vq_loss_layer_029": 0.000973, "vq_loss_layer_030": 0.001869, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.299824, "epoch": 0.00596, "grad_norm": 0.003553492948412895, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.054065, "kv_vq_loss": 0.000533, "learning_rate": 0.000943811564935059, "loss": 0.054587, "step": 5960, "value_mse_loss_layer_000": 0.000671, "value_mse_loss_layer_001": 0.001923, "value_mse_loss_layer_002": 0.007599, "value_mse_loss_layer_003": 0.013916, "value_mse_loss_layer_004": 0.010925, "value_mse_loss_layer_005": 0.010681, "value_mse_loss_layer_006": 0.012695, "value_mse_loss_layer_007": 0.013184, "value_mse_loss_layer_008": 0.016235, "value_mse_loss_layer_009": 0.021362, "value_mse_loss_layer_010": 0.017578, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.018188, "value_mse_loss_layer_013": 0.019897, "value_mse_loss_layer_014": 0.022217, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.019897, "value_mse_loss_layer_017": 0.023804, "value_mse_loss_layer_018": 0.025757, "value_mse_loss_layer_019": 0.027344, "value_mse_loss_layer_020": 0.028076, "value_mse_loss_layer_021": 0.031982, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.044922, "value_mse_loss_layer_024": 0.038818, "value_mse_loss_layer_025": 0.049561, "value_mse_loss_layer_026": 0.041992, "value_mse_loss_layer_027": 0.053955, "value_mse_loss_layer_028": 0.060547, "value_mse_loss_layer_029": 0.07959, "value_mse_loss_layer_030": 0.084473, "value_mse_loss_layer_031": 0.078125, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000188, "vq_loss_layer_008": 0.000202, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000202, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.000364, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000462, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.00042, "vq_loss_layer_019": 0.000201, "vq_loss_layer_020": 0.000229, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.000418, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000334, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.0009, "vq_loss_layer_029": 0.001244, "vq_loss_layer_030": 0.00386, "vq_loss_layer_031": 0.004791 }, { "ce_loss": 2.269939, "epoch": 0.00597, "grad_norm": 0.002906331093981862, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.011414, "key_mse_loss_layer_002": 0.062256, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.080078, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.088867, "key_mse_loss_layer_027": 0.093262, "key_mse_loss_layer_028": 0.097656, "key_mse_loss_layer_029": 0.091309, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.054453, "kv_vq_loss": 0.000568, "learning_rate": 0.0009439935827823422, "loss": 0.055008, "step": 5970, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007538, "value_mse_loss_layer_003": 0.012268, "value_mse_loss_layer_004": 0.011414, "value_mse_loss_layer_005": 0.010742, "value_mse_loss_layer_006": 0.01239, "value_mse_loss_layer_007": 0.013428, "value_mse_loss_layer_008": 0.016357, "value_mse_loss_layer_009": 0.020508, "value_mse_loss_layer_010": 0.017578, "value_mse_loss_layer_011": 0.018311, "value_mse_loss_layer_012": 0.02063, "value_mse_loss_layer_013": 0.020508, "value_mse_loss_layer_014": 0.022095, "value_mse_loss_layer_015": 0.024292, "value_mse_loss_layer_016": 0.020386, "value_mse_loss_layer_017": 0.02356, "value_mse_loss_layer_018": 0.023193, "value_mse_loss_layer_019": 0.026245, "value_mse_loss_layer_020": 0.027466, "value_mse_loss_layer_021": 0.031982, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.037598, "value_mse_loss_layer_024": 0.040527, "value_mse_loss_layer_025": 0.04834, "value_mse_loss_layer_026": 0.044189, "value_mse_loss_layer_027": 0.061279, "value_mse_loss_layer_028": 0.061523, "value_mse_loss_layer_029": 0.086914, "value_mse_loss_layer_030": 0.083496, "value_mse_loss_layer_031": 0.088379, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 4.3e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000133, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.000243, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000286, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.000492, "vq_loss_layer_013": 0.000368, "vq_loss_layer_014": 0.000475, "vq_loss_layer_015": 0.000542, "vq_loss_layer_016": 0.000473, "vq_loss_layer_017": 0.000401, "vq_loss_layer_018": 0.000271, "vq_loss_layer_019": 0.000239, "vq_loss_layer_020": 0.000236, "vq_loss_layer_021": 0.000441, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.000357, "vq_loss_layer_024": 0.000389, "vq_loss_layer_025": 0.000507, "vq_loss_layer_026": 0.000648, "vq_loss_layer_027": 0.000847, "vq_loss_layer_028": 0.001358, "vq_loss_layer_029": 0.001923, "vq_loss_layer_030": 0.003876, "vq_loss_layer_031": 0.007385 }, { "ce_loss": 2.277607, "epoch": 0.00598, "grad_norm": 0.003699816996231675, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.0539, "kv_vq_loss": 0.000547, "learning_rate": 0.0009441752959971026, "loss": 0.054428, "step": 5980, "value_mse_loss_layer_000": 0.000648, "value_mse_loss_layer_001": 0.001945, "value_mse_loss_layer_002": 0.007141, "value_mse_loss_layer_003": 0.011597, "value_mse_loss_layer_004": 0.010681, "value_mse_loss_layer_005": 0.011963, "value_mse_loss_layer_006": 0.013672, "value_mse_loss_layer_007": 0.013428, "value_mse_loss_layer_008": 0.016357, "value_mse_loss_layer_009": 0.020874, "value_mse_loss_layer_010": 0.017456, "value_mse_loss_layer_011": 0.018677, "value_mse_loss_layer_012": 0.018921, "value_mse_loss_layer_013": 0.021118, "value_mse_loss_layer_014": 0.021484, "value_mse_loss_layer_015": 0.025391, "value_mse_loss_layer_016": 0.020264, "value_mse_loss_layer_017": 0.026611, "value_mse_loss_layer_018": 0.021729, "value_mse_loss_layer_019": 0.024902, "value_mse_loss_layer_020": 0.026978, "value_mse_loss_layer_021": 0.03418, "value_mse_loss_layer_022": 0.031982, "value_mse_loss_layer_023": 0.034668, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.056885, "value_mse_loss_layer_026": 0.042236, "value_mse_loss_layer_027": 0.049072, "value_mse_loss_layer_028": 0.060791, "value_mse_loss_layer_029": 0.075195, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.072754, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 0.000125, "vq_loss_layer_006": 0.000176, "vq_loss_layer_007": 0.00019, "vq_loss_layer_008": 0.000197, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000212, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.000376, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.000412, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000481, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000511, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000408, "vq_loss_layer_026": 0.000576, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.001709, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.236723, "epoch": 0.00599, "grad_norm": 0.0037476366851478815, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.054224, "kv_vq_loss": 0.000548, "learning_rate": 0.0009443567055973278, "loss": 0.054761, "step": 5990, "value_mse_loss_layer_000": 0.000656, "value_mse_loss_layer_001": 0.001945, "value_mse_loss_layer_002": 0.007751, "value_mse_loss_layer_003": 0.014221, "value_mse_loss_layer_004": 0.012573, "value_mse_loss_layer_005": 0.01123, "value_mse_loss_layer_006": 0.012756, "value_mse_loss_layer_007": 0.01355, "value_mse_loss_layer_008": 0.016968, "value_mse_loss_layer_009": 0.021851, "value_mse_loss_layer_010": 0.019043, "value_mse_loss_layer_011": 0.018921, "value_mse_loss_layer_012": 0.019409, "value_mse_loss_layer_013": 0.02124, "value_mse_loss_layer_014": 0.023438, "value_mse_loss_layer_015": 0.024048, "value_mse_loss_layer_016": 0.025146, "value_mse_loss_layer_017": 0.025635, "value_mse_loss_layer_018": 0.021973, "value_mse_loss_layer_019": 0.031738, "value_mse_loss_layer_020": 0.028931, "value_mse_loss_layer_021": 0.032471, "value_mse_loss_layer_022": 0.031128, "value_mse_loss_layer_023": 0.034668, "value_mse_loss_layer_024": 0.038818, "value_mse_loss_layer_025": 0.047119, "value_mse_loss_layer_026": 0.040527, "value_mse_loss_layer_027": 0.052002, "value_mse_loss_layer_028": 0.054688, "value_mse_loss_layer_029": 0.071289, "value_mse_loss_layer_030": 0.071777, "value_mse_loss_layer_031": 0.087891, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 0.000124, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000149, "vq_loss_layer_007": 0.000202, "vq_loss_layer_008": 0.000232, "vq_loss_layer_009": 0.00028, "vq_loss_layer_010": 0.000237, "vq_loss_layer_011": 0.000275, "vq_loss_layer_012": 0.000433, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000496, "vq_loss_layer_015": 0.000475, "vq_loss_layer_016": 0.000542, "vq_loss_layer_017": 0.000431, "vq_loss_layer_018": 0.000229, "vq_loss_layer_019": 0.000238, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000429, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000298, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000397, "vq_loss_layer_026": 0.000538, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000828, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.005341 }, { "ce_loss": 2.265922, "epoch": 0.006, "grad_norm": 0.0036141301970928907, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.054007, "kv_vq_loss": 0.000552, "learning_rate": 0.0009445378125959107, "loss": 0.054541, "step": 6000, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.00193, "value_mse_loss_layer_002": 0.007568, "value_mse_loss_layer_003": 0.012329, "value_mse_loss_layer_004": 0.01123, "value_mse_loss_layer_005": 0.010681, "value_mse_loss_layer_006": 0.012817, "value_mse_loss_layer_007": 0.013611, "value_mse_loss_layer_008": 0.016235, "value_mse_loss_layer_009": 0.021606, "value_mse_loss_layer_010": 0.018188, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.019043, "value_mse_loss_layer_013": 0.020874, "value_mse_loss_layer_014": 0.021362, "value_mse_loss_layer_015": 0.023071, "value_mse_loss_layer_016": 0.019287, "value_mse_loss_layer_017": 0.023315, "value_mse_loss_layer_018": 0.024048, "value_mse_loss_layer_019": 0.024414, "value_mse_loss_layer_020": 0.025757, "value_mse_loss_layer_021": 0.029419, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.032959, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.042969, "value_mse_loss_layer_026": 0.037109, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.05127, "value_mse_loss_layer_029": 0.075195, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.075195, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000177, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000275, "vq_loss_layer_010": 0.00025, "vq_loss_layer_011": 0.00025, "vq_loss_layer_012": 0.000381, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.00045, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000399, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000391, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000378, "vq_loss_layer_022": 0.000387, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000402, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000648, "vq_loss_layer_028": 0.00087, "vq_loss_layer_029": 0.001213, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.005493 }, { "ce_loss": 2.371836, "epoch": 0.00601, "grad_norm": 0.002908446593210101, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.081543, "kv_mse_loss": 0.053946, "kv_vq_loss": 0.000534, "learning_rate": 0.0009447186180006847, "loss": 0.054471, "step": 6010, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.001923, "value_mse_loss_layer_002": 0.007233, "value_mse_loss_layer_003": 0.01178, "value_mse_loss_layer_004": 0.010559, "value_mse_loss_layer_005": 0.010437, "value_mse_loss_layer_006": 0.012573, "value_mse_loss_layer_007": 0.013855, "value_mse_loss_layer_008": 0.01709, "value_mse_loss_layer_009": 0.021973, "value_mse_loss_layer_010": 0.018921, "value_mse_loss_layer_011": 0.018921, "value_mse_loss_layer_012": 0.02002, "value_mse_loss_layer_013": 0.02124, "value_mse_loss_layer_014": 0.022217, "value_mse_loss_layer_015": 0.025146, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.026001, "value_mse_loss_layer_018": 0.022461, "value_mse_loss_layer_019": 0.026245, "value_mse_loss_layer_020": 0.028564, "value_mse_loss_layer_021": 0.036133, "value_mse_loss_layer_022": 0.032471, "value_mse_loss_layer_023": 0.037109, "value_mse_loss_layer_024": 0.039551, "value_mse_loss_layer_025": 0.046143, "value_mse_loss_layer_026": 0.041748, "value_mse_loss_layer_027": 0.054443, "value_mse_loss_layer_028": 0.060547, "value_mse_loss_layer_029": 0.071289, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.076172, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000148, "vq_loss_layer_007": 0.000205, "vq_loss_layer_008": 0.000223, "vq_loss_layer_009": 0.000278, "vq_loss_layer_010": 0.000233, "vq_loss_layer_011": 0.000259, "vq_loss_layer_012": 0.000458, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000422, "vq_loss_layer_015": 0.000467, "vq_loss_layer_016": 0.000383, "vq_loss_layer_017": 0.000519, "vq_loss_layer_018": 0.000224, "vq_loss_layer_019": 0.000195, "vq_loss_layer_020": 0.000227, "vq_loss_layer_021": 0.000448, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.00037, "vq_loss_layer_024": 0.000319, "vq_loss_layer_025": 0.000397, "vq_loss_layer_026": 0.00058, "vq_loss_layer_027": 0.000706, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.001686, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.005615 }, { "ce_loss": 2.264734, "epoch": 0.00602, "grad_norm": 0.0033280691131949425, "key_mse_loss_layer_000": 0.003571, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.053818, "kv_vq_loss": 0.000539, "learning_rate": 0.0009448991228144561, "loss": 0.05433, "step": 6020, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.00705, "value_mse_loss_layer_003": 0.011047, "value_mse_loss_layer_004": 0.010254, "value_mse_loss_layer_005": 0.009949, "value_mse_loss_layer_006": 0.01178, "value_mse_loss_layer_007": 0.012878, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.020508, "value_mse_loss_layer_010": 0.016846, "value_mse_loss_layer_011": 0.018311, "value_mse_loss_layer_012": 0.019165, "value_mse_loss_layer_013": 0.02063, "value_mse_loss_layer_014": 0.021362, "value_mse_loss_layer_015": 0.024536, "value_mse_loss_layer_016": 0.019653, "value_mse_loss_layer_017": 0.02356, "value_mse_loss_layer_018": 0.021606, "value_mse_loss_layer_019": 0.026245, "value_mse_loss_layer_020": 0.028442, "value_mse_loss_layer_021": 0.033691, "value_mse_loss_layer_022": 0.032227, "value_mse_loss_layer_023": 0.036133, "value_mse_loss_layer_024": 0.039795, "value_mse_loss_layer_025": 0.046143, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.054199, "value_mse_loss_layer_028": 0.058838, "value_mse_loss_layer_029": 0.095703, "value_mse_loss_layer_030": 0.075195, "value_mse_loss_layer_031": 0.075195, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000184, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000252, "vq_loss_layer_012": 0.000454, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000401, "vq_loss_layer_015": 0.000456, "vq_loss_layer_016": 0.000406, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000435, "vq_loss_layer_022": 0.000298, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.000326, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000851, "vq_loss_layer_029": 0.001434, "vq_loss_layer_030": 0.002502, "vq_loss_layer_031": 0.004578 }, { "ce_loss": 2.330314, "epoch": 0.00603, "grad_norm": 0.003491588868200779, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.082031, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.083984, "key_mse_loss_layer_024": 0.066406, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.073242, "key_mse_loss_layer_027": 0.072754, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.060791, "kv_mse_loss": 0.053473, "kv_vq_loss": 0.000515, "learning_rate": 0.0009450793280350378, "loss": 0.053979, "step": 6030, "value_mse_loss_layer_000": 0.00066, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007477, "value_mse_loss_layer_003": 0.011658, "value_mse_loss_layer_004": 0.011414, "value_mse_loss_layer_005": 0.010803, "value_mse_loss_layer_006": 0.013733, "value_mse_loss_layer_007": 0.013916, "value_mse_loss_layer_008": 0.016968, "value_mse_loss_layer_009": 0.022461, "value_mse_loss_layer_010": 0.019653, "value_mse_loss_layer_011": 0.019775, "value_mse_loss_layer_012": 0.02063, "value_mse_loss_layer_013": 0.022583, "value_mse_loss_layer_014": 0.024292, "value_mse_loss_layer_015": 0.026245, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.026733, "value_mse_loss_layer_018": 0.020996, "value_mse_loss_layer_019": 0.025391, "value_mse_loss_layer_020": 0.026733, "value_mse_loss_layer_021": 0.031006, "value_mse_loss_layer_022": 0.03064, "value_mse_loss_layer_023": 0.034424, "value_mse_loss_layer_024": 0.036133, "value_mse_loss_layer_025": 0.047363, "value_mse_loss_layer_026": 0.054932, "value_mse_loss_layer_027": 0.05127, "value_mse_loss_layer_028": 0.053223, "value_mse_loss_layer_029": 0.075684, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.072754, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.00018, "vq_loss_layer_007": 0.000203, "vq_loss_layer_008": 0.000241, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000273, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.000444, "vq_loss_layer_013": 0.000381, "vq_loss_layer_014": 0.000473, "vq_loss_layer_015": 0.000519, "vq_loss_layer_016": 0.000399, "vq_loss_layer_017": 0.00046, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000194, "vq_loss_layer_020": 0.000246, "vq_loss_layer_021": 0.000471, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.000357, "vq_loss_layer_024": 0.000343, "vq_loss_layer_025": 0.00042, "vq_loss_layer_026": 0.001076, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.001122, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.004639 }, { "ce_loss": 2.293887, "epoch": 0.00604, "grad_norm": 0.0033001285046339035, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.053635, "kv_vq_loss": 0.000545, "learning_rate": 0.0009452592346552829, "loss": 0.054163, "step": 6040, "value_mse_loss_layer_000": 0.000633, "value_mse_loss_layer_001": 0.001938, "value_mse_loss_layer_002": 0.007355, "value_mse_loss_layer_003": 0.012024, "value_mse_loss_layer_004": 0.01062, "value_mse_loss_layer_005": 0.010376, "value_mse_loss_layer_006": 0.01239, "value_mse_loss_layer_007": 0.013123, "value_mse_loss_layer_008": 0.016479, "value_mse_loss_layer_009": 0.020752, "value_mse_loss_layer_010": 0.017578, "value_mse_loss_layer_011": 0.018311, "value_mse_loss_layer_012": 0.019165, "value_mse_loss_layer_013": 0.02063, "value_mse_loss_layer_014": 0.023071, "value_mse_loss_layer_015": 0.02417, "value_mse_loss_layer_016": 0.026245, "value_mse_loss_layer_017": 0.023438, "value_mse_loss_layer_018": 0.023193, "value_mse_loss_layer_019": 0.025391, "value_mse_loss_layer_020": 0.026123, "value_mse_loss_layer_021": 0.034424, "value_mse_loss_layer_022": 0.034424, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.037842, "value_mse_loss_layer_025": 0.044922, "value_mse_loss_layer_026": 0.044189, "value_mse_loss_layer_027": 0.052734, "value_mse_loss_layer_028": 0.05542, "value_mse_loss_layer_029": 0.076172, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.080566, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000187, "vq_loss_layer_008": 0.000217, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000227, "vq_loss_layer_011": 0.000259, "vq_loss_layer_012": 0.000402, "vq_loss_layer_013": 0.000345, "vq_loss_layer_014": 0.000488, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000626, "vq_loss_layer_017": 0.000383, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000429, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000584, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000782, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.003372, "vq_loss_layer_031": 0.004669 }, { "ce_loss": 2.299649, "epoch": 0.00605, "grad_norm": 0.003337934147566557, "key_mse_loss_layer_000": 0.003571, "key_mse_loss_layer_001": 0.011108, "key_mse_loss_layer_002": 0.064453, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.09082, "key_mse_loss_layer_009": 0.095215, "key_mse_loss_layer_010": 0.108887, "key_mse_loss_layer_011": 0.105957, "key_mse_loss_layer_012": 0.078613, "key_mse_loss_layer_013": 0.134766, "key_mse_loss_layer_014": 0.129883, "key_mse_loss_layer_015": 0.116699, "key_mse_loss_layer_016": 0.115234, "key_mse_loss_layer_017": 0.111328, "key_mse_loss_layer_018": 0.125, "key_mse_loss_layer_019": 0.100586, "key_mse_loss_layer_020": 0.112793, "key_mse_loss_layer_021": 0.105469, "key_mse_loss_layer_022": 0.114258, "key_mse_loss_layer_023": 0.116211, "key_mse_loss_layer_024": 0.095703, "key_mse_loss_layer_025": 0.088379, "key_mse_loss_layer_026": 0.106934, "key_mse_loss_layer_027": 0.109863, "key_mse_loss_layer_028": 0.113281, "key_mse_loss_layer_029": 0.101562, "key_mse_loss_layer_030": 0.114746, "key_mse_loss_layer_031": 0.082031, "kv_mse_loss": 0.053745, "kv_vq_loss": 0.000545, "learning_rate": 0.0009454388436631172, "loss": 0.054266, "step": 6050, "value_mse_loss_layer_000": 0.000622, "value_mse_loss_layer_001": 0.001953, "value_mse_loss_layer_002": 0.00769, "value_mse_loss_layer_003": 0.012268, "value_mse_loss_layer_004": 0.011414, "value_mse_loss_layer_005": 0.011414, "value_mse_loss_layer_006": 0.012573, "value_mse_loss_layer_007": 0.013367, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.016968, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.018188, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.02002, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.018677, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.022095, "value_mse_loss_layer_019": 0.02478, "value_mse_loss_layer_020": 0.026733, "value_mse_loss_layer_021": 0.030396, "value_mse_loss_layer_022": 0.030029, "value_mse_loss_layer_023": 0.041504, "value_mse_loss_layer_024": 0.038818, "value_mse_loss_layer_025": 0.046387, "value_mse_loss_layer_026": 0.044434, "value_mse_loss_layer_027": 0.056396, "value_mse_loss_layer_028": 0.057373, "value_mse_loss_layer_029": 0.083008, "value_mse_loss_layer_030": 0.083008, "value_mse_loss_layer_031": 0.083984, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2.2e-05, "vq_loss_layer_002": 2.4e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 0.00011, "vq_loss_layer_006": 0.00016, "vq_loss_layer_007": 0.000187, "vq_loss_layer_008": 0.000239, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000252, "vq_loss_layer_011": 0.000267, "vq_loss_layer_012": 0.000389, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000441, "vq_loss_layer_015": 0.000456, "vq_loss_layer_016": 0.000408, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.00033, "vq_loss_layer_019": 0.000238, "vq_loss_layer_020": 0.00022, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000315, "vq_loss_layer_023": 0.000431, "vq_loss_layer_024": 0.000366, "vq_loss_layer_025": 0.000519, "vq_loss_layer_026": 0.00066, "vq_loss_layer_027": 0.000832, "vq_loss_layer_028": 0.001244, "vq_loss_layer_029": 0.002304, "vq_loss_layer_030": 0.003174, "vq_loss_layer_031": 0.007111 }, { "ce_loss": 2.280756, "epoch": 0.00606, "grad_norm": 0.002690501743927598, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.04126, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.061523, "key_mse_loss_layer_007": 0.071777, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.138672, "key_mse_loss_layer_014": 0.135742, "key_mse_loss_layer_015": 0.120605, "key_mse_loss_layer_016": 0.116699, "key_mse_loss_layer_017": 0.116699, "key_mse_loss_layer_018": 0.123535, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.10791, "key_mse_loss_layer_021": 0.104004, "key_mse_loss_layer_022": 0.108887, "key_mse_loss_layer_023": 0.108887, "key_mse_loss_layer_024": 0.084473, "key_mse_loss_layer_025": 0.078125, "key_mse_loss_layer_026": 0.092773, "key_mse_loss_layer_027": 0.087891, "key_mse_loss_layer_028": 0.098145, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.053802, "kv_vq_loss": 0.000533, "learning_rate": 0.0009456181560415714, "loss": 0.054324, "step": 6060, "value_mse_loss_layer_000": 0.000618, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.007233, "value_mse_loss_layer_003": 0.012817, "value_mse_loss_layer_004": 0.011047, "value_mse_loss_layer_005": 0.010925, "value_mse_loss_layer_006": 0.01239, "value_mse_loss_layer_007": 0.013794, "value_mse_loss_layer_008": 0.016357, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.017334, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.019531, "value_mse_loss_layer_013": 0.02063, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.018799, "value_mse_loss_layer_017": 0.022705, "value_mse_loss_layer_018": 0.020386, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.025269, "value_mse_loss_layer_021": 0.029419, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.037109, "value_mse_loss_layer_027": 0.050537, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.072266, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 5.3e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 8.8e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000203, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000271, "vq_loss_layer_010": 0.00028, "vq_loss_layer_011": 0.00036, "vq_loss_layer_012": 0.000481, "vq_loss_layer_013": 0.000385, "vq_loss_layer_014": 0.000538, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000435, "vq_loss_layer_017": 0.000378, "vq_loss_layer_018": 0.00023, "vq_loss_layer_019": 0.000208, "vq_loss_layer_020": 0.000275, "vq_loss_layer_021": 0.000483, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000368, "vq_loss_layer_024": 0.000338, "vq_loss_layer_025": 0.000364, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000652, "vq_loss_layer_028": 0.001137, "vq_loss_layer_029": 0.001144, "vq_loss_layer_030": 0.001945, "vq_loss_layer_031": 0.005432 }, { "ce_loss": 2.298648, "epoch": 0.00607, "grad_norm": 0.0025637676008045673, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.094727, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.104492, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.135742, "key_mse_loss_layer_014": 0.128906, "key_mse_loss_layer_015": 0.116211, "key_mse_loss_layer_016": 0.109863, "key_mse_loss_layer_017": 0.112305, "key_mse_loss_layer_018": 0.117188, "key_mse_loss_layer_019": 0.094727, "key_mse_loss_layer_020": 0.10791, "key_mse_loss_layer_021": 0.101074, "key_mse_loss_layer_022": 0.10498, "key_mse_loss_layer_023": 0.104492, "key_mse_loss_layer_024": 0.083008, "key_mse_loss_layer_025": 0.07666, "key_mse_loss_layer_026": 0.092285, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.096191, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.053647, "kv_vq_loss": 0.000514, "learning_rate": 0.0009457971727688143, "loss": 0.054153, "step": 6070, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.001884, "value_mse_loss_layer_002": 0.009583, "value_mse_loss_layer_003": 0.012268, "value_mse_loss_layer_004": 0.011108, "value_mse_loss_layer_005": 0.010681, "value_mse_loss_layer_006": 0.012451, "value_mse_loss_layer_007": 0.013672, "value_mse_loss_layer_008": 0.015991, "value_mse_loss_layer_009": 0.021973, "value_mse_loss_layer_010": 0.018555, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.019653, "value_mse_loss_layer_013": 0.02124, "value_mse_loss_layer_014": 0.021484, "value_mse_loss_layer_015": 0.022095, "value_mse_loss_layer_016": 0.019287, "value_mse_loss_layer_017": 0.023804, "value_mse_loss_layer_018": 0.020142, "value_mse_loss_layer_019": 0.02478, "value_mse_loss_layer_020": 0.025757, "value_mse_loss_layer_021": 0.030396, "value_mse_loss_layer_022": 0.028931, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.041992, "value_mse_loss_layer_026": 0.037598, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.05249, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.074707, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 3e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000153, "vq_loss_layer_007": 0.000201, "vq_loss_layer_008": 0.00024, "vq_loss_layer_009": 0.000322, "vq_loss_layer_010": 0.000273, "vq_loss_layer_011": 0.000273, "vq_loss_layer_012": 0.000435, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000484, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000387, "vq_loss_layer_017": 0.000404, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.00022, "vq_loss_layer_020": 0.00028, "vq_loss_layer_021": 0.000496, "vq_loss_layer_022": 0.000378, "vq_loss_layer_023": 0.000372, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.000477, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.001251, "vq_loss_layer_029": 0.001099, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.005524 }, { "ce_loss": 2.295686, "epoch": 0.00608, "grad_norm": 0.003072661580517888, "key_mse_loss_layer_000": 0.002731, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.043701, "key_mse_loss_layer_004": 0.041992, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.124023, "key_mse_loss_layer_015": 0.11084, "key_mse_loss_layer_016": 0.103027, "key_mse_loss_layer_017": 0.103027, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.054245, "kv_vq_loss": 0.000545, "learning_rate": 0.0009459758948181836, "loss": 0.054782, "step": 6080, "value_mse_loss_layer_000": 0.000641, "value_mse_loss_layer_001": 0.001923, "value_mse_loss_layer_002": 0.007812, "value_mse_loss_layer_003": 0.011353, "value_mse_loss_layer_004": 0.011108, "value_mse_loss_layer_005": 0.011353, "value_mse_loss_layer_006": 0.013733, "value_mse_loss_layer_007": 0.013916, "value_mse_loss_layer_008": 0.015991, "value_mse_loss_layer_009": 0.022583, "value_mse_loss_layer_010": 0.018311, "value_mse_loss_layer_011": 0.019043, "value_mse_loss_layer_012": 0.019409, "value_mse_loss_layer_013": 0.020996, "value_mse_loss_layer_014": 0.02124, "value_mse_loss_layer_015": 0.023193, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.02356, "value_mse_loss_layer_018": 0.02417, "value_mse_loss_layer_019": 0.023438, "value_mse_loss_layer_020": 0.027222, "value_mse_loss_layer_021": 0.029541, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.03064, "value_mse_loss_layer_024": 0.037109, "value_mse_loss_layer_025": 0.049561, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.066895, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 0.000113, "vq_loss_layer_006": 0.00022, "vq_loss_layer_007": 0.000209, "vq_loss_layer_008": 0.000232, "vq_loss_layer_009": 0.000372, "vq_loss_layer_010": 0.000292, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.000437, "vq_loss_layer_013": 0.000341, "vq_loss_layer_014": 0.000488, "vq_loss_layer_015": 0.000496, "vq_loss_layer_016": 0.000425, "vq_loss_layer_017": 0.000395, "vq_loss_layer_018": 0.000282, "vq_loss_layer_019": 0.00023, "vq_loss_layer_020": 0.00028, "vq_loss_layer_021": 0.000546, "vq_loss_layer_022": 0.000338, "vq_loss_layer_023": 0.000385, "vq_loss_layer_024": 0.000488, "vq_loss_layer_025": 0.000668, "vq_loss_layer_026": 0.00074, "vq_loss_layer_027": 0.000652, "vq_loss_layer_028": 0.000927, "vq_loss_layer_029": 0.00119, "vq_loss_layer_030": 0.002716, "vq_loss_layer_031": 0.005157 }, { "ce_loss": 2.283209, "epoch": 0.00609, "grad_norm": 0.003112752689048648, "key_mse_loss_layer_000": 0.003616, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.054269, "kv_vq_loss": 0.00054, "learning_rate": 0.0009461543231582187, "loss": 0.054788, "step": 6090, "value_mse_loss_layer_000": 0.000645, "value_mse_loss_layer_001": 0.001953, "value_mse_loss_layer_002": 0.007355, "value_mse_loss_layer_003": 0.011536, "value_mse_loss_layer_004": 0.010681, "value_mse_loss_layer_005": 0.010559, "value_mse_loss_layer_006": 0.01239, "value_mse_loss_layer_007": 0.013611, "value_mse_loss_layer_008": 0.016357, "value_mse_loss_layer_009": 0.021851, "value_mse_loss_layer_010": 0.017822, "value_mse_loss_layer_011": 0.019287, "value_mse_loss_layer_012": 0.02063, "value_mse_loss_layer_013": 0.021851, "value_mse_loss_layer_014": 0.022705, "value_mse_loss_layer_015": 0.025879, "value_mse_loss_layer_016": 0.020752, "value_mse_loss_layer_017": 0.025757, "value_mse_loss_layer_018": 0.023193, "value_mse_loss_layer_019": 0.025879, "value_mse_loss_layer_020": 0.0271, "value_mse_loss_layer_021": 0.032227, "value_mse_loss_layer_022": 0.033447, "value_mse_loss_layer_023": 0.035889, "value_mse_loss_layer_024": 0.039551, "value_mse_loss_layer_025": 0.049316, "value_mse_loss_layer_026": 0.038818, "value_mse_loss_layer_027": 0.050537, "value_mse_loss_layer_028": 0.056641, "value_mse_loss_layer_029": 0.073242, "value_mse_loss_layer_030": 0.072266, "value_mse_loss_layer_031": 0.081055, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 9.8e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000144, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000217, "vq_loss_layer_009": 0.000301, "vq_loss_layer_010": 0.000231, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.000488, "vq_loss_layer_013": 0.00036, "vq_loss_layer_014": 0.000446, "vq_loss_layer_015": 0.000523, "vq_loss_layer_016": 0.000439, "vq_loss_layer_017": 0.000475, "vq_loss_layer_018": 0.000263, "vq_loss_layer_019": 0.000221, "vq_loss_layer_020": 0.000263, "vq_loss_layer_021": 0.000488, "vq_loss_layer_022": 0.000372, "vq_loss_layer_023": 0.000412, "vq_loss_layer_024": 0.000378, "vq_loss_layer_025": 0.000504, "vq_loss_layer_026": 0.000618, "vq_loss_layer_027": 0.00082, "vq_loss_layer_028": 0.001251, "vq_loss_layer_029": 0.001602, "vq_loss_layer_030": 0.002853, "vq_loss_layer_031": 0.005432 }, { "ce_loss": 2.2729, "epoch": 0.0061, "grad_norm": 0.0027991444803774357, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.053558, "kv_vq_loss": 0.000516, "learning_rate": 0.0009463324587526916, "loss": 0.054062, "step": 6100, "value_mse_loss_layer_000": 0.000675, "value_mse_loss_layer_001": 0.001953, "value_mse_loss_layer_002": 0.007294, "value_mse_loss_layer_003": 0.012207, "value_mse_loss_layer_004": 0.011047, "value_mse_loss_layer_005": 0.010742, "value_mse_loss_layer_006": 0.012695, "value_mse_loss_layer_007": 0.013428, "value_mse_loss_layer_008": 0.016479, "value_mse_loss_layer_009": 0.02124, "value_mse_loss_layer_010": 0.017822, "value_mse_loss_layer_011": 0.018799, "value_mse_loss_layer_012": 0.019653, "value_mse_loss_layer_013": 0.021729, "value_mse_loss_layer_014": 0.02356, "value_mse_loss_layer_015": 0.024536, "value_mse_loss_layer_016": 0.020752, "value_mse_loss_layer_017": 0.024292, "value_mse_loss_layer_018": 0.022217, "value_mse_loss_layer_019": 0.025635, "value_mse_loss_layer_020": 0.0271, "value_mse_loss_layer_021": 0.0354, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.0354, "value_mse_loss_layer_024": 0.038574, "value_mse_loss_layer_025": 0.047852, "value_mse_loss_layer_026": 0.040527, "value_mse_loss_layer_027": 0.052246, "value_mse_loss_layer_028": 0.055664, "value_mse_loss_layer_029": 0.080078, "value_mse_loss_layer_030": 0.075195, "value_mse_loss_layer_031": 0.083984, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000149, "vq_loss_layer_007": 0.000194, "vq_loss_layer_008": 0.000214, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000222, "vq_loss_layer_011": 0.000273, "vq_loss_layer_012": 0.000408, "vq_loss_layer_013": 0.000372, "vq_loss_layer_014": 0.000511, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000443, "vq_loss_layer_017": 0.000355, "vq_loss_layer_018": 0.000244, "vq_loss_layer_019": 0.000188, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000475, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000374, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.000824, "vq_loss_layer_029": 0.00132, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.005493 }, { "ce_loss": 2.292667, "epoch": 0.00611, "grad_norm": 0.0030353921465575695, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.05408, "kv_vq_loss": 0.000528, "learning_rate": 0.0009465103025606385, "loss": 0.054587, "step": 6110, "value_mse_loss_layer_000": 0.000633, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.00708, "value_mse_loss_layer_003": 0.011597, "value_mse_loss_layer_004": 0.010437, "value_mse_loss_layer_005": 0.010437, "value_mse_loss_layer_006": 0.012451, "value_mse_loss_layer_007": 0.013367, "value_mse_loss_layer_008": 0.016846, "value_mse_loss_layer_009": 0.021362, "value_mse_loss_layer_010": 0.017822, "value_mse_loss_layer_011": 0.018799, "value_mse_loss_layer_012": 0.019897, "value_mse_loss_layer_013": 0.02063, "value_mse_loss_layer_014": 0.021362, "value_mse_loss_layer_015": 0.024048, "value_mse_loss_layer_016": 0.020996, "value_mse_loss_layer_017": 0.024048, "value_mse_loss_layer_018": 0.022461, "value_mse_loss_layer_019": 0.025269, "value_mse_loss_layer_020": 0.026733, "value_mse_loss_layer_021": 0.031006, "value_mse_loss_layer_022": 0.03064, "value_mse_loss_layer_023": 0.035156, "value_mse_loss_layer_024": 0.038086, "value_mse_loss_layer_025": 0.044434, "value_mse_loss_layer_026": 0.04248, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.059326, "value_mse_loss_layer_029": 0.077148, "value_mse_loss_layer_030": 0.075195, "value_mse_loss_layer_031": 0.074219, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.000139, "vq_loss_layer_007": 0.000212, "vq_loss_layer_008": 0.000226, "vq_loss_layer_009": 0.00029, "vq_loss_layer_010": 0.000222, "vq_loss_layer_011": 0.000273, "vq_loss_layer_012": 0.000452, "vq_loss_layer_013": 0.000347, "vq_loss_layer_014": 0.000401, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000454, "vq_loss_layer_017": 0.000376, "vq_loss_layer_018": 0.000241, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000347, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000874, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.002823, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.268324, "epoch": 0.00612, "grad_norm": 0.0029531088657677174, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.054071, "kv_vq_loss": 0.000547, "learning_rate": 0.0009466878555363901, "loss": 0.054608, "step": 6120, "value_mse_loss_layer_000": 0.000641, "value_mse_loss_layer_001": 0.00193, "value_mse_loss_layer_002": 0.007172, "value_mse_loss_layer_003": 0.011292, "value_mse_loss_layer_004": 0.010376, "value_mse_loss_layer_005": 0.010376, "value_mse_loss_layer_006": 0.012268, "value_mse_loss_layer_007": 0.013123, "value_mse_loss_layer_008": 0.015869, "value_mse_loss_layer_009": 0.021118, "value_mse_loss_layer_010": 0.017212, "value_mse_loss_layer_011": 0.018433, "value_mse_loss_layer_012": 0.019653, "value_mse_loss_layer_013": 0.020142, "value_mse_loss_layer_014": 0.022339, "value_mse_loss_layer_015": 0.02478, "value_mse_loss_layer_016": 0.020264, "value_mse_loss_layer_017": 0.023682, "value_mse_loss_layer_018": 0.02124, "value_mse_loss_layer_019": 0.025879, "value_mse_loss_layer_020": 0.027344, "value_mse_loss_layer_021": 0.030273, "value_mse_loss_layer_022": 0.030151, "value_mse_loss_layer_023": 0.035156, "value_mse_loss_layer_024": 0.036133, "value_mse_loss_layer_025": 0.042725, "value_mse_loss_layer_026": 0.037109, "value_mse_loss_layer_027": 0.052246, "value_mse_loss_layer_028": 0.052979, "value_mse_loss_layer_029": 0.071777, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.072266, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000193, "vq_loss_layer_008": 0.000185, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000256, "vq_loss_layer_012": 0.000441, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000393, "vq_loss_layer_017": 0.000395, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000273, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.001076, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.322357, "epoch": 0.00613, "grad_norm": 0.0027171974070370197, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.125977, "key_mse_loss_layer_015": 0.114258, "key_mse_loss_layer_016": 0.109375, "key_mse_loss_layer_017": 0.10791, "key_mse_loss_layer_018": 0.117676, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.100586, "key_mse_loss_layer_022": 0.105469, "key_mse_loss_layer_023": 0.101074, "key_mse_loss_layer_024": 0.081543, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.053976, "kv_vq_loss": 0.000555, "learning_rate": 0.0009468651186296037, "loss": 0.054523, "step": 6130, "value_mse_loss_layer_000": 0.000641, "value_mse_loss_layer_001": 0.00193, "value_mse_loss_layer_002": 0.007996, "value_mse_loss_layer_003": 0.012024, "value_mse_loss_layer_004": 0.011658, "value_mse_loss_layer_005": 0.01123, "value_mse_loss_layer_006": 0.013611, "value_mse_loss_layer_007": 0.013672, "value_mse_loss_layer_008": 0.016113, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.0177, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.018311, "value_mse_loss_layer_013": 0.020142, "value_mse_loss_layer_014": 0.020996, "value_mse_loss_layer_015": 0.022461, "value_mse_loss_layer_016": 0.019775, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.022339, "value_mse_loss_layer_019": 0.023315, "value_mse_loss_layer_020": 0.02478, "value_mse_loss_layer_021": 0.028809, "value_mse_loss_layer_022": 0.028564, "value_mse_loss_layer_023": 0.030518, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.04834, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.069824, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.075684, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 2.4e-05, "vq_loss_layer_003": 4.3e-05, "vq_loss_layer_004": 8.5e-05, "vq_loss_layer_005": 0.000106, "vq_loss_layer_006": 0.000219, "vq_loss_layer_007": 0.000208, "vq_loss_layer_008": 0.000254, "vq_loss_layer_009": 0.000275, "vq_loss_layer_010": 0.000259, "vq_loss_layer_011": 0.000265, "vq_loss_layer_012": 0.000418, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000481, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000475, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000294, "vq_loss_layer_019": 0.000191, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000465, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.000381, "vq_loss_layer_025": 0.0005, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.000805, "vq_loss_layer_028": 0.001068, "vq_loss_layer_029": 0.001663, "vq_loss_layer_030": 0.003662, "vq_loss_layer_031": 0.005981 }, { "ce_loss": 2.292982, "epoch": 0.00614, "grad_norm": 0.0034811049699783325, "key_mse_loss_layer_000": 0.002472, "key_mse_loss_layer_001": 0.009583, "key_mse_loss_layer_002": 0.052246, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.124023, "key_mse_loss_layer_014": 0.120117, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.054166, "kv_vq_loss": 0.000547, "learning_rate": 0.0009470420927852919, "loss": 0.054691, "step": 6140, "value_mse_loss_layer_000": 0.000614, "value_mse_loss_layer_001": 0.001892, "value_mse_loss_layer_002": 0.007294, "value_mse_loss_layer_003": 0.011414, "value_mse_loss_layer_004": 0.010742, "value_mse_loss_layer_005": 0.01062, "value_mse_loss_layer_006": 0.012878, "value_mse_loss_layer_007": 0.013733, "value_mse_loss_layer_008": 0.016602, "value_mse_loss_layer_009": 0.023071, "value_mse_loss_layer_010": 0.019287, "value_mse_loss_layer_011": 0.019531, "value_mse_loss_layer_012": 0.019287, "value_mse_loss_layer_013": 0.021851, "value_mse_loss_layer_014": 0.021973, "value_mse_loss_layer_015": 0.024902, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.025391, "value_mse_loss_layer_018": 0.020386, "value_mse_loss_layer_019": 0.024536, "value_mse_loss_layer_020": 0.027344, "value_mse_loss_layer_021": 0.047607, "value_mse_loss_layer_022": 0.028564, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.044189, "value_mse_loss_layer_026": 0.043945, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.05542, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.070312, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000163, "vq_loss_layer_007": 0.000201, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000366, "vq_loss_layer_010": 0.000263, "vq_loss_layer_011": 0.000305, "vq_loss_layer_012": 0.00041, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.000446, "vq_loss_layer_015": 0.0005, "vq_loss_layer_016": 0.00041, "vq_loss_layer_017": 0.000425, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.00019, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.000854, "vq_loss_layer_022": 0.000271, "vq_loss_layer_023": 0.000336, "vq_loss_layer_024": 0.000296, "vq_loss_layer_025": 0.000437, "vq_loss_layer_026": 0.000874, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.001221, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.361849, "epoch": 0.00615, "grad_norm": 0.003618357004597783, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.053668, "kv_vq_loss": 0.000529, "learning_rate": 0.0009472187789438541, "loss": 0.054184, "step": 6150, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.001907, "value_mse_loss_layer_002": 0.007202, "value_mse_loss_layer_003": 0.011353, "value_mse_loss_layer_004": 0.01062, "value_mse_loss_layer_005": 0.010315, "value_mse_loss_layer_006": 0.012451, "value_mse_loss_layer_007": 0.01355, "value_mse_loss_layer_008": 0.016235, "value_mse_loss_layer_009": 0.02063, "value_mse_loss_layer_010": 0.017578, "value_mse_loss_layer_011": 0.018188, "value_mse_loss_layer_012": 0.021484, "value_mse_loss_layer_013": 0.020386, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.024292, "value_mse_loss_layer_016": 0.019287, "value_mse_loss_layer_017": 0.025146, "value_mse_loss_layer_018": 0.021851, "value_mse_loss_layer_019": 0.027466, "value_mse_loss_layer_020": 0.027588, "value_mse_loss_layer_021": 0.032959, "value_mse_loss_layer_022": 0.031128, "value_mse_loss_layer_023": 0.035889, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.044678, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.056152, "value_mse_loss_layer_028": 0.056396, "value_mse_loss_layer_029": 0.09082, "value_mse_loss_layer_030": 0.071289, "value_mse_loss_layer_031": 0.074707, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000215, "vq_loss_layer_008": 0.000205, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000223, "vq_loss_layer_011": 0.000239, "vq_loss_layer_012": 0.000587, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000422, "vq_loss_layer_015": 0.000511, "vq_loss_layer_016": 0.000387, "vq_loss_layer_017": 0.000414, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.00041, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.00032, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000641, "vq_loss_layer_028": 0.000874, "vq_loss_layer_029": 0.001373, "vq_loss_layer_030": 0.001808, "vq_loss_layer_031": 0.004272 }, { "ce_loss": 2.310228, "epoch": 0.00616, "grad_norm": 0.0033253824803978205, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.053824, "kv_vq_loss": 0.000536, "learning_rate": 0.0009473951780411062, "loss": 0.054349, "step": 6160, "value_mse_loss_layer_000": 0.000626, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.007324, "value_mse_loss_layer_003": 0.012817, "value_mse_loss_layer_004": 0.010559, "value_mse_loss_layer_005": 0.010437, "value_mse_loss_layer_006": 0.012634, "value_mse_loss_layer_007": 0.013489, "value_mse_loss_layer_008": 0.015991, "value_mse_loss_layer_009": 0.021118, "value_mse_loss_layer_010": 0.017578, "value_mse_loss_layer_011": 0.018433, "value_mse_loss_layer_012": 0.019409, "value_mse_loss_layer_013": 0.020874, "value_mse_loss_layer_014": 0.021729, "value_mse_loss_layer_015": 0.023926, "value_mse_loss_layer_016": 0.02002, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.021362, "value_mse_loss_layer_019": 0.025391, "value_mse_loss_layer_020": 0.026367, "value_mse_loss_layer_021": 0.032471, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.032715, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.044434, "value_mse_loss_layer_026": 0.037598, "value_mse_loss_layer_027": 0.049805, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.080566, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.000152, "vq_loss_layer_007": 0.000189, "vq_loss_layer_008": 0.0002, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000219, "vq_loss_layer_011": 0.000241, "vq_loss_layer_012": 0.00041, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000425, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000404, "vq_loss_layer_017": 0.000395, "vq_loss_layer_018": 0.000252, "vq_loss_layer_019": 0.000188, "vq_loss_layer_020": 0.000231, "vq_loss_layer_021": 0.000473, "vq_loss_layer_022": 0.000355, "vq_loss_layer_023": 0.000338, "vq_loss_layer_024": 0.000351, "vq_loss_layer_025": 0.000408, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000656, "vq_loss_layer_028": 0.000889, "vq_loss_layer_029": 0.001137, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.005859 }, { "ce_loss": 2.300122, "epoch": 0.00617, "grad_norm": 0.002797802211716771, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.053397, "kv_vq_loss": 0.000522, "learning_rate": 0.0009475712910083102, "loss": 0.053903, "step": 6170, "value_mse_loss_layer_000": 0.000652, "value_mse_loss_layer_001": 0.001923, "value_mse_loss_layer_002": 0.007324, "value_mse_loss_layer_003": 0.011414, "value_mse_loss_layer_004": 0.010315, "value_mse_loss_layer_005": 0.010498, "value_mse_loss_layer_006": 0.013184, "value_mse_loss_layer_007": 0.013306, "value_mse_loss_layer_008": 0.017334, "value_mse_loss_layer_009": 0.023071, "value_mse_loss_layer_010": 0.018066, "value_mse_loss_layer_011": 0.018799, "value_mse_loss_layer_012": 0.019409, "value_mse_loss_layer_013": 0.021484, "value_mse_loss_layer_014": 0.021851, "value_mse_loss_layer_015": 0.026245, "value_mse_loss_layer_016": 0.020996, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.02124, "value_mse_loss_layer_019": 0.024902, "value_mse_loss_layer_020": 0.02771, "value_mse_loss_layer_021": 0.030884, "value_mse_loss_layer_022": 0.030518, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.035645, "value_mse_loss_layer_025": 0.046875, "value_mse_loss_layer_026": 0.039062, "value_mse_loss_layer_027": 0.047607, "value_mse_loss_layer_028": 0.053467, "value_mse_loss_layer_029": 0.068848, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.070801, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000163, "vq_loss_layer_007": 0.000188, "vq_loss_layer_008": 0.000256, "vq_loss_layer_009": 0.000349, "vq_loss_layer_010": 0.000217, "vq_loss_layer_011": 0.000256, "vq_loss_layer_012": 0.000399, "vq_loss_layer_013": 0.00038, "vq_loss_layer_014": 0.000416, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.000416, "vq_loss_layer_017": 0.00036, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.00036, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000896, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.305809, "epoch": 0.00618, "grad_norm": 0.0028895698487758636, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.079102, "kv_mse_loss": 0.053815, "kv_vq_loss": 0.000528, "learning_rate": 0.0009477471187722039, "loss": 0.05433, "step": 6180, "value_mse_loss_layer_000": 0.000645, "value_mse_loss_layer_001": 0.001915, "value_mse_loss_layer_002": 0.006989, "value_mse_loss_layer_003": 0.011047, "value_mse_loss_layer_004": 0.010437, "value_mse_loss_layer_005": 0.009827, "value_mse_loss_layer_006": 0.011963, "value_mse_loss_layer_007": 0.013062, "value_mse_loss_layer_008": 0.016235, "value_mse_loss_layer_009": 0.021362, "value_mse_loss_layer_010": 0.018188, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.019287, "value_mse_loss_layer_013": 0.020996, "value_mse_loss_layer_014": 0.021851, "value_mse_loss_layer_015": 0.02478, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.026245, "value_mse_loss_layer_018": 0.023438, "value_mse_loss_layer_019": 0.026367, "value_mse_loss_layer_020": 0.026855, "value_mse_loss_layer_021": 0.031494, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.036377, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.044189, "value_mse_loss_layer_026": 0.038574, "value_mse_loss_layer_027": 0.055664, "value_mse_loss_layer_028": 0.05542, "value_mse_loss_layer_029": 0.071289, "value_mse_loss_layer_030": 0.070801, "value_mse_loss_layer_031": 0.07373, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000198, "vq_loss_layer_008": 0.000185, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000219, "vq_loss_layer_011": 0.00024, "vq_loss_layer_012": 0.000406, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000418, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000383, "vq_loss_layer_017": 0.000446, "vq_loss_layer_018": 0.000246, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.000226, "vq_loss_layer_021": 0.000387, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000307, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000656, "vq_loss_layer_028": 0.00103, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.004761 }, { "ce_loss": 2.327525, "epoch": 0.00619, "grad_norm": 0.004502951633185148, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.054062, "kv_vq_loss": 0.000549, "learning_rate": 0.0009479226622550295, "loss": 0.054608, "step": 6190, "value_mse_loss_layer_000": 0.000622, "value_mse_loss_layer_001": 0.001907, "value_mse_loss_layer_002": 0.007324, "value_mse_loss_layer_003": 0.010681, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.009949, "value_mse_loss_layer_006": 0.014099, "value_mse_loss_layer_007": 0.013062, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.020508, "value_mse_loss_layer_010": 0.017822, "value_mse_loss_layer_011": 0.018433, "value_mse_loss_layer_012": 0.018677, "value_mse_loss_layer_013": 0.021118, "value_mse_loss_layer_014": 0.022217, "value_mse_loss_layer_015": 0.023804, "value_mse_loss_layer_016": 0.02002, "value_mse_loss_layer_017": 0.023804, "value_mse_loss_layer_018": 0.020752, "value_mse_loss_layer_019": 0.02478, "value_mse_loss_layer_020": 0.027222, "value_mse_loss_layer_021": 0.043945, "value_mse_loss_layer_022": 0.031982, "value_mse_loss_layer_023": 0.034912, "value_mse_loss_layer_024": 0.050537, "value_mse_loss_layer_025": 0.043457, "value_mse_loss_layer_026": 0.039795, "value_mse_loss_layer_027": 0.051514, "value_mse_loss_layer_028": 0.052734, "value_mse_loss_layer_029": 0.090332, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.074707, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000226, "vq_loss_layer_007": 0.000202, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000244, "vq_loss_layer_010": 0.000223, "vq_loss_layer_011": 0.00025, "vq_loss_layer_012": 0.000401, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000462, "vq_loss_layer_015": 0.000471, "vq_loss_layer_016": 0.000399, "vq_loss_layer_017": 0.000406, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000185, "vq_loss_layer_020": 0.000233, "vq_loss_layer_021": 0.000656, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000427, "vq_loss_layer_025": 0.00038, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.001266, "vq_loss_layer_030": 0.003159, "vq_loss_layer_031": 0.00412 }, { "ce_loss": 2.276409, "epoch": 0.0062, "grad_norm": 0.0028199595399200916, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.053958, "kv_vq_loss": 0.000541, "learning_rate": 0.0009480979223745633, "loss": 0.05448, "step": 6200, "value_mse_loss_layer_000": 0.00061, "value_mse_loss_layer_001": 0.001915, "value_mse_loss_layer_002": 0.007141, "value_mse_loss_layer_003": 0.011353, "value_mse_loss_layer_004": 0.011047, "value_mse_loss_layer_005": 0.010498, "value_mse_loss_layer_006": 0.012512, "value_mse_loss_layer_007": 0.013489, "value_mse_loss_layer_008": 0.016479, "value_mse_loss_layer_009": 0.021851, "value_mse_loss_layer_010": 0.018066, "value_mse_loss_layer_011": 0.018433, "value_mse_loss_layer_012": 0.020874, "value_mse_loss_layer_013": 0.020386, "value_mse_loss_layer_014": 0.02124, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.020752, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.021729, "value_mse_loss_layer_019": 0.025879, "value_mse_loss_layer_020": 0.02771, "value_mse_loss_layer_021": 0.034424, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.035889, "value_mse_loss_layer_024": 0.040283, "value_mse_loss_layer_025": 0.050049, "value_mse_loss_layer_026": 0.040771, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.059814, "value_mse_loss_layer_029": 0.087891, "value_mse_loss_layer_030": 0.07373, "value_mse_loss_layer_031": 0.075195, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000139, "vq_loss_layer_007": 0.00019, "vq_loss_layer_008": 0.000217, "vq_loss_layer_009": 0.000311, "vq_loss_layer_010": 0.000214, "vq_loss_layer_011": 0.000261, "vq_loss_layer_012": 0.000465, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000397, "vq_loss_layer_017": 0.000385, "vq_loss_layer_018": 0.000244, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000243, "vq_loss_layer_021": 0.00045, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.001122, "vq_loss_layer_029": 0.001144, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.004059 }, { "ce_loss": 2.299157, "epoch": 0.00621, "grad_norm": 0.0034020044840872288, "key_mse_loss_layer_000": 0.004486, "key_mse_loss_layer_001": 0.012207, "key_mse_loss_layer_002": 0.063965, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.071777, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.112305, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.10498, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.104492, "key_mse_loss_layer_023": 0.104492, "key_mse_loss_layer_024": 0.085938, "key_mse_loss_layer_025": 0.07959, "key_mse_loss_layer_026": 0.094727, "key_mse_loss_layer_027": 0.096191, "key_mse_loss_layer_028": 0.100098, "key_mse_loss_layer_029": 0.094238, "key_mse_loss_layer_030": 0.096191, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.05365, "kv_vq_loss": 0.000538, "learning_rate": 0.0009482729000441449, "loss": 0.054181, "step": 6210, "value_mse_loss_layer_000": 0.000641, "value_mse_loss_layer_001": 0.001953, "value_mse_loss_layer_002": 0.007355, "value_mse_loss_layer_003": 0.01239, "value_mse_loss_layer_004": 0.011353, "value_mse_loss_layer_005": 0.010986, "value_mse_loss_layer_006": 0.012573, "value_mse_loss_layer_007": 0.013489, "value_mse_loss_layer_008": 0.016479, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.0177, "value_mse_loss_layer_011": 0.018677, "value_mse_loss_layer_012": 0.020264, "value_mse_loss_layer_013": 0.021362, "value_mse_loss_layer_014": 0.022949, "value_mse_loss_layer_015": 0.024048, "value_mse_loss_layer_016": 0.019653, "value_mse_loss_layer_017": 0.023071, "value_mse_loss_layer_018": 0.022461, "value_mse_loss_layer_019": 0.027222, "value_mse_loss_layer_020": 0.027466, "value_mse_loss_layer_021": 0.030884, "value_mse_loss_layer_022": 0.034424, "value_mse_loss_layer_023": 0.035889, "value_mse_loss_layer_024": 0.042969, "value_mse_loss_layer_025": 0.048096, "value_mse_loss_layer_026": 0.044678, "value_mse_loss_layer_027": 0.056641, "value_mse_loss_layer_028": 0.059082, "value_mse_loss_layer_029": 0.089844, "value_mse_loss_layer_030": 0.099609, "value_mse_loss_layer_031": 0.084961, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2.3e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 3.8e-05, "vq_loss_layer_004": 9e-05, "vq_loss_layer_005": 0.000105, "vq_loss_layer_006": 0.000156, "vq_loss_layer_007": 0.000208, "vq_loss_layer_008": 0.000267, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000286, "vq_loss_layer_011": 0.00029, "vq_loss_layer_012": 0.00045, "vq_loss_layer_013": 0.000433, "vq_loss_layer_014": 0.000549, "vq_loss_layer_015": 0.000542, "vq_loss_layer_016": 0.000456, "vq_loss_layer_017": 0.00042, "vq_loss_layer_018": 0.000303, "vq_loss_layer_019": 0.000278, "vq_loss_layer_020": 0.00024, "vq_loss_layer_021": 0.000393, "vq_loss_layer_022": 0.000368, "vq_loss_layer_023": 0.000303, "vq_loss_layer_024": 0.000334, "vq_loss_layer_025": 0.0005, "vq_loss_layer_026": 0.000599, "vq_loss_layer_027": 0.00074, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.00174, "vq_loss_layer_030": 0.00415, "vq_loss_layer_031": 0.006348 }, { "ce_loss": 2.281738, "epoch": 0.00622, "grad_norm": 0.0028734670486301184, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.053732, "kv_vq_loss": 0.000533, "learning_rate": 0.0009484475961727046, "loss": 0.054245, "step": 6220, "value_mse_loss_layer_000": 0.000645, "value_mse_loss_layer_001": 0.001915, "value_mse_loss_layer_002": 0.008118, "value_mse_loss_layer_003": 0.012146, "value_mse_loss_layer_004": 0.01123, "value_mse_loss_layer_005": 0.010681, "value_mse_loss_layer_006": 0.013123, "value_mse_loss_layer_007": 0.014038, "value_mse_loss_layer_008": 0.016113, "value_mse_loss_layer_009": 0.02124, "value_mse_loss_layer_010": 0.018066, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.019775, "value_mse_loss_layer_013": 0.02124, "value_mse_loss_layer_014": 0.021729, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.024658, "value_mse_loss_layer_018": 0.021362, "value_mse_loss_layer_019": 0.025146, "value_mse_loss_layer_020": 0.027222, "value_mse_loss_layer_021": 0.031128, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.046143, "value_mse_loss_layer_024": 0.03833, "value_mse_loss_layer_025": 0.047119, "value_mse_loss_layer_026": 0.039062, "value_mse_loss_layer_027": 0.056396, "value_mse_loss_layer_028": 0.056396, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.072754, "value_mse_loss_layer_031": 0.075195, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 0.0001, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000175, "vq_loss_layer_007": 0.00024, "vq_loss_layer_008": 0.000197, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000241, "vq_loss_layer_011": 0.00025, "vq_loss_layer_012": 0.000416, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000435, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000399, "vq_loss_layer_017": 0.00038, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.000189, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000372, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.000538, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000645, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.000946, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.325257, "epoch": 0.00623, "grad_norm": 0.0033348745200783014, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.053537, "kv_vq_loss": 0.00054, "learning_rate": 0.0009486220116647923, "loss": 0.054074, "step": 6230, "value_mse_loss_layer_000": 0.00066, "value_mse_loss_layer_001": 0.001923, "value_mse_loss_layer_002": 0.007507, "value_mse_loss_layer_003": 0.011536, "value_mse_loss_layer_004": 0.010498, "value_mse_loss_layer_005": 0.010559, "value_mse_loss_layer_006": 0.012817, "value_mse_loss_layer_007": 0.013733, "value_mse_loss_layer_008": 0.016968, "value_mse_loss_layer_009": 0.021729, "value_mse_loss_layer_010": 0.017944, "value_mse_loss_layer_011": 0.019531, "value_mse_loss_layer_012": 0.019897, "value_mse_loss_layer_013": 0.021606, "value_mse_loss_layer_014": 0.022461, "value_mse_loss_layer_015": 0.024414, "value_mse_loss_layer_016": 0.02124, "value_mse_loss_layer_017": 0.025024, "value_mse_loss_layer_018": 0.021973, "value_mse_loss_layer_019": 0.025513, "value_mse_loss_layer_020": 0.026733, "value_mse_loss_layer_021": 0.030762, "value_mse_loss_layer_022": 0.03064, "value_mse_loss_layer_023": 0.037109, "value_mse_loss_layer_024": 0.036621, "value_mse_loss_layer_025": 0.04541, "value_mse_loss_layer_026": 0.040527, "value_mse_loss_layer_027": 0.05249, "value_mse_loss_layer_028": 0.053223, "value_mse_loss_layer_029": 0.081055, "value_mse_loss_layer_030": 0.070801, "value_mse_loss_layer_031": 0.077148, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000208, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000225, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000406, "vq_loss_layer_013": 0.000364, "vq_loss_layer_014": 0.000454, "vq_loss_layer_015": 0.000425, "vq_loss_layer_016": 0.000418, "vq_loss_layer_017": 0.000401, "vq_loss_layer_018": 0.000227, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000238, "vq_loss_layer_021": 0.000395, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000391, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000406, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000851, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.004547 }, { "ce_loss": 2.349345, "epoch": 0.00624, "grad_norm": 0.002469969680532813, "key_mse_loss_layer_000": 0.003967, "key_mse_loss_layer_001": 0.01123, "key_mse_loss_layer_002": 0.059814, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.058838, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.080078, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.077637, "kv_mse_loss": 0.053464, "kv_vq_loss": 0.000503, "learning_rate": 0.0009487961474206059, "loss": 0.053967, "step": 6240, "value_mse_loss_layer_000": 0.00066, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007324, "value_mse_loss_layer_003": 0.013123, "value_mse_loss_layer_004": 0.010498, "value_mse_loss_layer_005": 0.010254, "value_mse_loss_layer_006": 0.012268, "value_mse_loss_layer_007": 0.013123, "value_mse_loss_layer_008": 0.016113, "value_mse_loss_layer_009": 0.021362, "value_mse_loss_layer_010": 0.019897, "value_mse_loss_layer_011": 0.018433, "value_mse_loss_layer_012": 0.019775, "value_mse_loss_layer_013": 0.020752, "value_mse_loss_layer_014": 0.022461, "value_mse_loss_layer_015": 0.02478, "value_mse_loss_layer_016": 0.021606, "value_mse_loss_layer_017": 0.02478, "value_mse_loss_layer_018": 0.022339, "value_mse_loss_layer_019": 0.027222, "value_mse_loss_layer_020": 0.032227, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.032715, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.039551, "value_mse_loss_layer_025": 0.046875, "value_mse_loss_layer_026": 0.040771, "value_mse_loss_layer_027": 0.053467, "value_mse_loss_layer_028": 0.056641, "value_mse_loss_layer_029": 0.078613, "value_mse_loss_layer_030": 0.073242, "value_mse_loss_layer_031": 0.074707, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.00014, "vq_loss_layer_007": 0.000212, "vq_loss_layer_008": 0.000197, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000446, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.000452, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.000452, "vq_loss_layer_017": 0.00041, "vq_loss_layer_018": 0.00023, "vq_loss_layer_019": 0.000201, "vq_loss_layer_020": 0.000275, "vq_loss_layer_021": 0.000395, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000353, "vq_loss_layer_025": 0.000343, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.001045, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.004242 }, { "ce_loss": 2.288524, "epoch": 0.00625, "grad_norm": 0.002458977047353983, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.05358, "kv_vq_loss": 0.000524, "learning_rate": 0.0009489700043360187, "loss": 0.054105, "step": 6250, "value_mse_loss_layer_000": 0.000614, "value_mse_loss_layer_001": 0.001923, "value_mse_loss_layer_002": 0.006989, "value_mse_loss_layer_003": 0.011536, "value_mse_loss_layer_004": 0.011169, "value_mse_loss_layer_005": 0.010437, "value_mse_loss_layer_006": 0.012512, "value_mse_loss_layer_007": 0.013977, "value_mse_loss_layer_008": 0.016235, "value_mse_loss_layer_009": 0.02124, "value_mse_loss_layer_010": 0.018433, "value_mse_loss_layer_011": 0.019409, "value_mse_loss_layer_012": 0.020142, "value_mse_loss_layer_013": 0.021606, "value_mse_loss_layer_014": 0.022217, "value_mse_loss_layer_015": 0.025146, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.02478, "value_mse_loss_layer_018": 0.022461, "value_mse_loss_layer_019": 0.025513, "value_mse_loss_layer_020": 0.027344, "value_mse_loss_layer_021": 0.03125, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.037842, "value_mse_loss_layer_025": 0.043457, "value_mse_loss_layer_026": 0.039551, "value_mse_loss_layer_027": 0.049072, "value_mse_loss_layer_028": 0.055176, "value_mse_loss_layer_029": 0.068848, "value_mse_loss_layer_030": 0.071289, "value_mse_loss_layer_031": 0.075195, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 8.8e-05, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.000144, "vq_loss_layer_007": 0.000221, "vq_loss_layer_008": 0.000217, "vq_loss_layer_009": 0.000261, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.000298, "vq_loss_layer_012": 0.000443, "vq_loss_layer_013": 0.00037, "vq_loss_layer_014": 0.000454, "vq_loss_layer_015": 0.0005, "vq_loss_layer_016": 0.000423, "vq_loss_layer_017": 0.000368, "vq_loss_layer_018": 0.000269, "vq_loss_layer_019": 0.000182, "vq_loss_layer_020": 0.000242, "vq_loss_layer_021": 0.000423, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000477, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.00082, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.001808, "vq_loss_layer_031": 0.004486 }, { "ce_loss": 2.330715, "epoch": 0.00626, "grad_norm": 0.003220793791115284, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.053479, "kv_vq_loss": 0.000537, "learning_rate": 0.0009491435833026073, "loss": 0.054007, "step": 6260, "value_mse_loss_layer_000": 0.000641, "value_mse_loss_layer_001": 0.001938, "value_mse_loss_layer_002": 0.007172, "value_mse_loss_layer_003": 0.01178, "value_mse_loss_layer_004": 0.010864, "value_mse_loss_layer_005": 0.010925, "value_mse_loss_layer_006": 0.012878, "value_mse_loss_layer_007": 0.013611, "value_mse_loss_layer_008": 0.016602, "value_mse_loss_layer_009": 0.023071, "value_mse_loss_layer_010": 0.0177, "value_mse_loss_layer_011": 0.019043, "value_mse_loss_layer_012": 0.019897, "value_mse_loss_layer_013": 0.02124, "value_mse_loss_layer_014": 0.021851, "value_mse_loss_layer_015": 0.02478, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.024292, "value_mse_loss_layer_018": 0.022827, "value_mse_loss_layer_019": 0.025513, "value_mse_loss_layer_020": 0.027466, "value_mse_loss_layer_021": 0.033936, "value_mse_loss_layer_022": 0.030518, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.037842, "value_mse_loss_layer_025": 0.044434, "value_mse_loss_layer_026": 0.03833, "value_mse_loss_layer_027": 0.050781, "value_mse_loss_layer_028": 0.053955, "value_mse_loss_layer_029": 0.079102, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.072754, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000174, "vq_loss_layer_007": 0.000206, "vq_loss_layer_008": 0.000227, "vq_loss_layer_009": 0.000364, "vq_loss_layer_010": 0.000231, "vq_loss_layer_011": 0.000269, "vq_loss_layer_012": 0.000431, "vq_loss_layer_013": 0.000328, "vq_loss_layer_014": 0.000422, "vq_loss_layer_015": 0.000492, "vq_loss_layer_016": 0.000397, "vq_loss_layer_017": 0.000399, "vq_loss_layer_018": 0.000226, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.000241, "vq_loss_layer_021": 0.000444, "vq_loss_layer_022": 0.000288, "vq_loss_layer_023": 0.000364, "vq_loss_layer_024": 0.000351, "vq_loss_layer_025": 0.00041, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.000938, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.004242 }, { "ce_loss": 2.309923, "epoch": 0.00627, "grad_norm": 0.00255202385596931, "key_mse_loss_layer_000": 0.002884, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.053641, "kv_vq_loss": 0.000519, "learning_rate": 0.000949316885207679, "loss": 0.054147, "step": 6270, "value_mse_loss_layer_000": 0.000671, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.007294, "value_mse_loss_layer_003": 0.011169, "value_mse_loss_layer_004": 0.010254, "value_mse_loss_layer_005": 0.01062, "value_mse_loss_layer_006": 0.012329, "value_mse_loss_layer_007": 0.013123, "value_mse_loss_layer_008": 0.016113, "value_mse_loss_layer_009": 0.021484, "value_mse_loss_layer_010": 0.018188, "value_mse_loss_layer_011": 0.018799, "value_mse_loss_layer_012": 0.019409, "value_mse_loss_layer_013": 0.020874, "value_mse_loss_layer_014": 0.022339, "value_mse_loss_layer_015": 0.024536, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.023804, "value_mse_loss_layer_018": 0.021606, "value_mse_loss_layer_019": 0.025635, "value_mse_loss_layer_020": 0.026123, "value_mse_loss_layer_021": 0.034912, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.033691, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.044678, "value_mse_loss_layer_026": 0.04248, "value_mse_loss_layer_027": 0.049561, "value_mse_loss_layer_028": 0.052734, "value_mse_loss_layer_029": 0.066895, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.072754, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000141, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.000211, "vq_loss_layer_009": 0.000294, "vq_loss_layer_010": 0.000227, "vq_loss_layer_011": 0.000254, "vq_loss_layer_012": 0.000395, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.000463, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000359, "vq_loss_layer_018": 0.000256, "vq_loss_layer_019": 0.000189, "vq_loss_layer_020": 0.00023, "vq_loss_layer_021": 0.000492, "vq_loss_layer_022": 0.00032, "vq_loss_layer_023": 0.000366, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000391, "vq_loss_layer_026": 0.000626, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000977, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.305652, "epoch": 0.00628, "grad_norm": 0.003360537812113762, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.053485, "kv_vq_loss": 0.000527, "learning_rate": 0.0009494899109342988, "loss": 0.054001, "step": 6280, "value_mse_loss_layer_000": 0.000652, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.007294, "value_mse_loss_layer_003": 0.011414, "value_mse_loss_layer_004": 0.011108, "value_mse_loss_layer_005": 0.010437, "value_mse_loss_layer_006": 0.012512, "value_mse_loss_layer_007": 0.013733, "value_mse_loss_layer_008": 0.016602, "value_mse_loss_layer_009": 0.02124, "value_mse_loss_layer_010": 0.017944, "value_mse_loss_layer_011": 0.019165, "value_mse_loss_layer_012": 0.020264, "value_mse_loss_layer_013": 0.022827, "value_mse_loss_layer_014": 0.022705, "value_mse_loss_layer_015": 0.025635, "value_mse_loss_layer_016": 0.020996, "value_mse_loss_layer_017": 0.025391, "value_mse_loss_layer_018": 0.021118, "value_mse_loss_layer_019": 0.024902, "value_mse_loss_layer_020": 0.028687, "value_mse_loss_layer_021": 0.032227, "value_mse_loss_layer_022": 0.032471, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.046875, "value_mse_loss_layer_025": 0.046387, "value_mse_loss_layer_026": 0.039795, "value_mse_loss_layer_027": 0.049561, "value_mse_loss_layer_028": 0.064453, "value_mse_loss_layer_029": 0.073242, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.072266, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 8.6e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.000137, "vq_loss_layer_007": 0.000223, "vq_loss_layer_008": 0.000215, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000234, "vq_loss_layer_011": 0.000269, "vq_loss_layer_012": 0.00045, "vq_loss_layer_013": 0.000481, "vq_loss_layer_014": 0.000471, "vq_loss_layer_015": 0.000515, "vq_loss_layer_016": 0.000452, "vq_loss_layer_017": 0.000477, "vq_loss_layer_018": 0.000241, "vq_loss_layer_019": 0.000208, "vq_loss_layer_020": 0.000282, "vq_loss_layer_021": 0.000473, "vq_loss_layer_022": 0.000338, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000414, "vq_loss_layer_025": 0.000366, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.001091, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.00415 }, { "ce_loss": 2.30557, "epoch": 0.00629, "grad_norm": 0.0029121574480086565, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.053583, "kv_vq_loss": 0.000537, "learning_rate": 0.000949662661361317, "loss": 0.054105, "step": 6290, "value_mse_loss_layer_000": 0.000633, "value_mse_loss_layer_001": 0.001945, "value_mse_loss_layer_002": 0.006989, "value_mse_loss_layer_003": 0.011963, "value_mse_loss_layer_004": 0.010559, "value_mse_loss_layer_005": 0.010742, "value_mse_loss_layer_006": 0.011963, "value_mse_loss_layer_007": 0.012939, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.02124, "value_mse_loss_layer_010": 0.018066, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.019531, "value_mse_loss_layer_013": 0.02002, "value_mse_loss_layer_014": 0.022827, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.019775, "value_mse_loss_layer_017": 0.022827, "value_mse_loss_layer_018": 0.021851, "value_mse_loss_layer_019": 0.02478, "value_mse_loss_layer_020": 0.026245, "value_mse_loss_layer_021": 0.030029, "value_mse_loss_layer_022": 0.029663, "value_mse_loss_layer_023": 0.035156, "value_mse_loss_layer_024": 0.039062, "value_mse_loss_layer_025": 0.04248, "value_mse_loss_layer_026": 0.039307, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.05542, "value_mse_loss_layer_029": 0.075195, "value_mse_loss_layer_030": 0.071289, "value_mse_loss_layer_031": 0.072754, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.00021, "vq_loss_layer_009": 0.000309, "vq_loss_layer_010": 0.000236, "vq_loss_layer_011": 0.000242, "vq_loss_layer_012": 0.000479, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000469, "vq_loss_layer_015": 0.000486, "vq_loss_layer_016": 0.000465, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000227, "vq_loss_layer_019": 0.000197, "vq_loss_layer_020": 0.000223, "vq_loss_layer_021": 0.000383, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000322, "vq_loss_layer_024": 0.000328, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000881, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.312659, "epoch": 0.0063, "grad_norm": 0.003411669982597232, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.052246, "key_mse_loss_layer_003": 0.043945, "key_mse_loss_layer_004": 0.041748, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.093262, "key_mse_loss_layer_019": 0.081055, "key_mse_loss_layer_020": 0.088867, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.084961, "key_mse_loss_layer_023": 0.082031, "key_mse_loss_layer_024": 0.062988, "key_mse_loss_layer_025": 0.063965, "key_mse_loss_layer_026": 0.071777, "key_mse_loss_layer_027": 0.070801, "key_mse_loss_layer_028": 0.078125, "key_mse_loss_layer_029": 0.075684, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.05372, "kv_vq_loss": 0.000524, "learning_rate": 0.0009498351373633952, "loss": 0.054242, "step": 6300, "value_mse_loss_layer_000": 0.000629, "value_mse_loss_layer_001": 0.001892, "value_mse_loss_layer_002": 0.007263, "value_mse_loss_layer_003": 0.011475, "value_mse_loss_layer_004": 0.010498, "value_mse_loss_layer_005": 0.010498, "value_mse_loss_layer_006": 0.012207, "value_mse_loss_layer_007": 0.013367, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.021484, "value_mse_loss_layer_010": 0.0177, "value_mse_loss_layer_011": 0.019043, "value_mse_loss_layer_012": 0.019775, "value_mse_loss_layer_013": 0.02124, "value_mse_loss_layer_014": 0.022705, "value_mse_loss_layer_015": 0.025391, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.026733, "value_mse_loss_layer_018": 0.02063, "value_mse_loss_layer_019": 0.024292, "value_mse_loss_layer_020": 0.026245, "value_mse_loss_layer_021": 0.030273, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.037842, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.047119, "value_mse_loss_layer_026": 0.04126, "value_mse_loss_layer_027": 0.048096, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.074707, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 9.1e-05, "vq_loss_layer_006": 0.000133, "vq_loss_layer_007": 0.000199, "vq_loss_layer_008": 0.000203, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000278, "vq_loss_layer_012": 0.000429, "vq_loss_layer_013": 0.000347, "vq_loss_layer_014": 0.000471, "vq_loss_layer_015": 0.000538, "vq_loss_layer_016": 0.000458, "vq_loss_layer_017": 0.000538, "vq_loss_layer_018": 0.000243, "vq_loss_layer_019": 0.000252, "vq_loss_layer_020": 0.000299, "vq_loss_layer_021": 0.000546, "vq_loss_layer_022": 0.000523, "vq_loss_layer_023": 0.000618, "vq_loss_layer_024": 0.000387, "vq_loss_layer_025": 0.000614, "vq_loss_layer_026": 0.000832, "vq_loss_layer_027": 0.001045, "vq_loss_layer_028": 0.001312, "vq_loss_layer_029": 0.002731, "vq_loss_layer_030": 0.003098, "vq_loss_layer_031": 0.006958 }, { "ce_loss": 2.269584, "epoch": 0.00631, "grad_norm": 0.002941409358754754, "key_mse_loss_layer_000": 0.003632, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.043457, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.054248, "kv_vq_loss": 0.000536, "learning_rate": 0.0009500073398110334, "loss": 0.054779, "step": 6310, "value_mse_loss_layer_000": 0.000645, "value_mse_loss_layer_001": 0.00193, "value_mse_loss_layer_002": 0.007355, "value_mse_loss_layer_003": 0.01239, "value_mse_loss_layer_004": 0.011169, "value_mse_loss_layer_005": 0.010437, "value_mse_loss_layer_006": 0.012512, "value_mse_loss_layer_007": 0.013733, "value_mse_loss_layer_008": 0.016357, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.019165, "value_mse_loss_layer_011": 0.018066, "value_mse_loss_layer_012": 0.019775, "value_mse_loss_layer_013": 0.021118, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.023193, "value_mse_loss_layer_016": 0.019653, "value_mse_loss_layer_017": 0.023193, "value_mse_loss_layer_018": 0.021118, "value_mse_loss_layer_019": 0.029785, "value_mse_loss_layer_020": 0.026367, "value_mse_loss_layer_021": 0.030762, "value_mse_loss_layer_022": 0.029419, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.038086, "value_mse_loss_layer_025": 0.043701, "value_mse_loss_layer_026": 0.039062, "value_mse_loss_layer_027": 0.049316, "value_mse_loss_layer_028": 0.051514, "value_mse_loss_layer_029": 0.075195, "value_mse_loss_layer_030": 0.071289, "value_mse_loss_layer_031": 0.075684, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 4.3e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000149, "vq_loss_layer_007": 0.000197, "vq_loss_layer_008": 0.000242, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000311, "vq_loss_layer_011": 0.000257, "vq_loss_layer_012": 0.000481, "vq_loss_layer_013": 0.000366, "vq_loss_layer_014": 0.000443, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000458, "vq_loss_layer_017": 0.000368, "vq_loss_layer_018": 0.000227, "vq_loss_layer_019": 0.000226, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.000431, "vq_loss_layer_022": 0.00028, "vq_loss_layer_023": 0.000319, "vq_loss_layer_024": 0.000357, "vq_loss_layer_025": 0.000444, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.000603, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.00145, "vq_loss_layer_030": 0.003784, "vq_loss_layer_031": 0.005554 }, { "ce_loss": 2.252975, "epoch": 0.00632, "grad_norm": 0.003120575100183487, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.081543, "kv_mse_loss": 0.053714, "kv_vq_loss": 0.000534, "learning_rate": 0.0009501792695705962, "loss": 0.054245, "step": 6320, "value_mse_loss_layer_000": 0.000614, "value_mse_loss_layer_001": 0.001869, "value_mse_loss_layer_002": 0.007416, "value_mse_loss_layer_003": 0.011292, "value_mse_loss_layer_004": 0.011108, "value_mse_loss_layer_005": 0.010559, "value_mse_loss_layer_006": 0.01239, "value_mse_loss_layer_007": 0.013123, "value_mse_loss_layer_008": 0.015625, "value_mse_loss_layer_009": 0.020874, "value_mse_loss_layer_010": 0.017944, "value_mse_loss_layer_011": 0.018799, "value_mse_loss_layer_012": 0.019287, "value_mse_loss_layer_013": 0.020142, "value_mse_loss_layer_014": 0.020996, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.019897, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.020996, "value_mse_loss_layer_019": 0.024292, "value_mse_loss_layer_020": 0.025635, "value_mse_loss_layer_021": 0.031494, "value_mse_loss_layer_022": 0.029053, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.043213, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.056885, "value_mse_loss_layer_028": 0.057129, "value_mse_loss_layer_029": 0.064941, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 7.3e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000138, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.000206, "vq_loss_layer_009": 0.000292, "vq_loss_layer_010": 0.00025, "vq_loss_layer_011": 0.000284, "vq_loss_layer_012": 0.00042, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000473, "vq_loss_layer_016": 0.00042, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000284, "vq_loss_layer_019": 0.000203, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000437, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000353, "vq_loss_layer_024": 0.000319, "vq_loss_layer_025": 0.000471, "vq_loss_layer_026": 0.000561, "vq_loss_layer_027": 0.00106, "vq_loss_layer_028": 0.001335, "vq_loss_layer_029": 0.001801, "vq_loss_layer_030": 0.003006, "vq_loss_layer_031": 0.006256 }, { "ce_loss": 2.311405, "epoch": 0.00633, "grad_norm": 0.0028981491923332214, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.054199, "kv_vq_loss": 0.000527, "learning_rate": 0.0009503509275043386, "loss": 0.054715, "step": 6330, "value_mse_loss_layer_000": 0.000626, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.006989, "value_mse_loss_layer_003": 0.011108, "value_mse_loss_layer_004": 0.009949, "value_mse_loss_layer_005": 0.010315, "value_mse_loss_layer_006": 0.012268, "value_mse_loss_layer_007": 0.013489, "value_mse_loss_layer_008": 0.015991, "value_mse_loss_layer_009": 0.02063, "value_mse_loss_layer_010": 0.017456, "value_mse_loss_layer_011": 0.018311, "value_mse_loss_layer_012": 0.018799, "value_mse_loss_layer_013": 0.020752, "value_mse_loss_layer_014": 0.021606, "value_mse_loss_layer_015": 0.02417, "value_mse_loss_layer_016": 0.021973, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.021484, "value_mse_loss_layer_019": 0.024292, "value_mse_loss_layer_020": 0.026123, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.030396, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.044922, "value_mse_loss_layer_026": 0.045166, "value_mse_loss_layer_027": 0.049561, "value_mse_loss_layer_028": 0.055908, "value_mse_loss_layer_029": 0.079102, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.000137, "vq_loss_layer_007": 0.000209, "vq_loss_layer_008": 0.0002, "vq_loss_layer_009": 0.000244, "vq_loss_layer_010": 0.00021, "vq_loss_layer_011": 0.000233, "vq_loss_layer_012": 0.000402, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000418, "vq_loss_layer_015": 0.000465, "vq_loss_layer_016": 0.00045, "vq_loss_layer_017": 0.000376, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000441, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000288, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000698, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.003876 }, { "ce_loss": 2.348951, "epoch": 0.00634, "grad_norm": 0.0033109700307250023, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.05354, "kv_vq_loss": 0.000523, "learning_rate": 0.0009505223144704332, "loss": 0.054059, "step": 6340, "value_mse_loss_layer_000": 0.000629, "value_mse_loss_layer_001": 0.001862, "value_mse_loss_layer_002": 0.006927, "value_mse_loss_layer_003": 0.010986, "value_mse_loss_layer_004": 0.009888, "value_mse_loss_layer_005": 0.010254, "value_mse_loss_layer_006": 0.012634, "value_mse_loss_layer_007": 0.013733, "value_mse_loss_layer_008": 0.016357, "value_mse_loss_layer_009": 0.022339, "value_mse_loss_layer_010": 0.018433, "value_mse_loss_layer_011": 0.019775, "value_mse_loss_layer_012": 0.021606, "value_mse_loss_layer_013": 0.021606, "value_mse_loss_layer_014": 0.024536, "value_mse_loss_layer_015": 0.025513, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.021118, "value_mse_loss_layer_019": 0.024902, "value_mse_loss_layer_020": 0.027222, "value_mse_loss_layer_021": 0.030029, "value_mse_loss_layer_022": 0.030396, "value_mse_loss_layer_023": 0.049561, "value_mse_loss_layer_024": 0.037354, "value_mse_loss_layer_025": 0.04541, "value_mse_loss_layer_026": 0.037598, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.052734, "value_mse_loss_layer_029": 0.071777, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.071289, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000155, "vq_loss_layer_007": 0.00022, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000313, "vq_loss_layer_010": 0.000235, "vq_loss_layer_011": 0.000311, "vq_loss_layer_012": 0.000496, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000511, "vq_loss_layer_015": 0.000484, "vq_loss_layer_016": 0.000414, "vq_loss_layer_017": 0.000397, "vq_loss_layer_018": 0.000232, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.000246, "vq_loss_layer_021": 0.000389, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.000748, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.000904, "vq_loss_layer_029": 0.001259, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.004822 }, { "ce_loss": 2.249165, "epoch": 0.00635, "grad_norm": 0.0023395908065140247, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.124512, "key_mse_loss_layer_014": 0.120117, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.053937, "kv_vq_loss": 0.000537, "learning_rate": 0.0009506934313229938, "loss": 0.054462, "step": 6350, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.00193, "value_mse_loss_layer_002": 0.007751, "value_mse_loss_layer_003": 0.011902, "value_mse_loss_layer_004": 0.011902, "value_mse_loss_layer_005": 0.010315, "value_mse_loss_layer_006": 0.012146, "value_mse_loss_layer_007": 0.013062, "value_mse_loss_layer_008": 0.015991, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.01709, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.018555, "value_mse_loss_layer_013": 0.020386, "value_mse_loss_layer_014": 0.02124, "value_mse_loss_layer_015": 0.022705, "value_mse_loss_layer_016": 0.019043, "value_mse_loss_layer_017": 0.023315, "value_mse_loss_layer_018": 0.023926, "value_mse_loss_layer_019": 0.02478, "value_mse_loss_layer_020": 0.025513, "value_mse_loss_layer_021": 0.029663, "value_mse_loss_layer_022": 0.029175, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.045166, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.049561, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.069824, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.073242, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 8.4e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.00018, "vq_loss_layer_008": 0.000205, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000224, "vq_loss_layer_011": 0.000259, "vq_loss_layer_012": 0.000404, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000462, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.000404, "vq_loss_layer_017": 0.000359, "vq_loss_layer_018": 0.000288, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000404, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000332, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.00036, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.0047 }, { "ce_loss": 2.325092, "epoch": 0.00636, "grad_norm": 0.0026350715197622776, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.09375, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.053372, "kv_vq_loss": 0.000532, "learning_rate": 0.0009508642789121032, "loss": 0.053894, "step": 6360, "value_mse_loss_layer_000": 0.000618, "value_mse_loss_layer_001": 0.001892, "value_mse_loss_layer_002": 0.006897, "value_mse_loss_layer_003": 0.01123, "value_mse_loss_layer_004": 0.010986, "value_mse_loss_layer_005": 0.010559, "value_mse_loss_layer_006": 0.012024, "value_mse_loss_layer_007": 0.013062, "value_mse_loss_layer_008": 0.015991, "value_mse_loss_layer_009": 0.02002, "value_mse_loss_layer_010": 0.017456, "value_mse_loss_layer_011": 0.018433, "value_mse_loss_layer_012": 0.018066, "value_mse_loss_layer_013": 0.020508, "value_mse_loss_layer_014": 0.020874, "value_mse_loss_layer_015": 0.023315, "value_mse_loss_layer_016": 0.019897, "value_mse_loss_layer_017": 0.023193, "value_mse_loss_layer_018": 0.022095, "value_mse_loss_layer_019": 0.025269, "value_mse_loss_layer_020": 0.027832, "value_mse_loss_layer_021": 0.032959, "value_mse_loss_layer_022": 0.030884, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.038574, "value_mse_loss_layer_025": 0.045166, "value_mse_loss_layer_026": 0.039795, "value_mse_loss_layer_027": 0.054688, "value_mse_loss_layer_028": 0.056641, "value_mse_loss_layer_029": 0.073242, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.075684, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.0002, "vq_loss_layer_008": 0.000215, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000204, "vq_loss_layer_011": 0.000265, "vq_loss_layer_012": 0.000366, "vq_loss_layer_013": 0.000395, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000395, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.000228, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.000782, "vq_loss_layer_029": 0.001114, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.004303 }, { "ce_loss": 2.229973, "epoch": 0.00637, "grad_norm": 0.003315755631774664, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.05405, "kv_vq_loss": 0.000536, "learning_rate": 0.0009510348580838374, "loss": 0.054581, "step": 6370, "value_mse_loss_layer_000": 0.000656, "value_mse_loss_layer_001": 0.00193, "value_mse_loss_layer_002": 0.007202, "value_mse_loss_layer_003": 0.011658, "value_mse_loss_layer_004": 0.010742, "value_mse_loss_layer_005": 0.010925, "value_mse_loss_layer_006": 0.012207, "value_mse_loss_layer_007": 0.013306, "value_mse_loss_layer_008": 0.016113, "value_mse_loss_layer_009": 0.021606, "value_mse_loss_layer_010": 0.017334, "value_mse_loss_layer_011": 0.018311, "value_mse_loss_layer_012": 0.019287, "value_mse_loss_layer_013": 0.02063, "value_mse_loss_layer_014": 0.02356, "value_mse_loss_layer_015": 0.023804, "value_mse_loss_layer_016": 0.02002, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.020996, "value_mse_loss_layer_019": 0.025513, "value_mse_loss_layer_020": 0.026367, "value_mse_loss_layer_021": 0.031006, "value_mse_loss_layer_022": 0.030762, "value_mse_loss_layer_023": 0.034424, "value_mse_loss_layer_024": 0.041992, "value_mse_loss_layer_025": 0.047363, "value_mse_loss_layer_026": 0.042969, "value_mse_loss_layer_027": 0.050781, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.081055, "value_mse_loss_layer_030": 0.074707, "value_mse_loss_layer_031": 0.079102, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 0.000104, "vq_loss_layer_006": 0.000145, "vq_loss_layer_007": 0.00021, "vq_loss_layer_008": 0.000209, "vq_loss_layer_009": 0.000313, "vq_loss_layer_010": 0.000229, "vq_loss_layer_011": 0.000259, "vq_loss_layer_012": 0.000393, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000553, "vq_loss_layer_015": 0.000454, "vq_loss_layer_016": 0.000439, "vq_loss_layer_017": 0.000408, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000288, "vq_loss_layer_024": 0.000366, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.000557, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.001236, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.005127 }, { "ce_loss": 2.243662, "epoch": 0.00638, "grad_norm": 0.0026737398002296686, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.054132, "kv_vq_loss": 0.00054, "learning_rate": 0.0009512051696802904, "loss": 0.054663, "step": 6380, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001846, "value_mse_loss_layer_002": 0.00705, "value_mse_loss_layer_003": 0.011108, "value_mse_loss_layer_004": 0.010864, "value_mse_loss_layer_005": 0.010559, "value_mse_loss_layer_006": 0.012939, "value_mse_loss_layer_007": 0.013184, "value_mse_loss_layer_008": 0.016724, "value_mse_loss_layer_009": 0.021118, "value_mse_loss_layer_010": 0.018799, "value_mse_loss_layer_011": 0.019043, "value_mse_loss_layer_012": 0.019531, "value_mse_loss_layer_013": 0.020752, "value_mse_loss_layer_014": 0.021729, "value_mse_loss_layer_015": 0.023804, "value_mse_loss_layer_016": 0.020264, "value_mse_loss_layer_017": 0.023804, "value_mse_loss_layer_018": 0.021851, "value_mse_loss_layer_019": 0.024414, "value_mse_loss_layer_020": 0.026123, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.038818, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.044678, "value_mse_loss_layer_026": 0.041016, "value_mse_loss_layer_027": 0.054443, "value_mse_loss_layer_028": 0.055176, "value_mse_loss_layer_029": 0.078613, "value_mse_loss_layer_030": 0.074219, "value_mse_loss_layer_031": 0.070801, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000162, "vq_loss_layer_007": 0.000191, "vq_loss_layer_008": 0.00022, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000241, "vq_loss_layer_011": 0.000257, "vq_loss_layer_012": 0.000423, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000441, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000423, "vq_loss_layer_017": 0.000362, "vq_loss_layer_018": 0.000227, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000218, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000292, "vq_loss_layer_023": 0.000389, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.00415 }, { "ce_loss": 2.276759, "epoch": 0.00639, "grad_norm": 0.0029477160423994064, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.053458, "kv_vq_loss": 0.000513, "learning_rate": 0.0009513752145395999, "loss": 0.053961, "step": 6390, "value_mse_loss_layer_000": 0.000614, "value_mse_loss_layer_001": 0.001884, "value_mse_loss_layer_002": 0.006836, "value_mse_loss_layer_003": 0.011475, "value_mse_loss_layer_004": 0.010193, "value_mse_loss_layer_005": 0.010132, "value_mse_loss_layer_006": 0.012451, "value_mse_loss_layer_007": 0.013367, "value_mse_loss_layer_008": 0.016113, "value_mse_loss_layer_009": 0.021851, "value_mse_loss_layer_010": 0.017456, "value_mse_loss_layer_011": 0.018311, "value_mse_loss_layer_012": 0.018921, "value_mse_loss_layer_013": 0.021606, "value_mse_loss_layer_014": 0.021851, "value_mse_loss_layer_015": 0.025024, "value_mse_loss_layer_016": 0.020386, "value_mse_loss_layer_017": 0.024048, "value_mse_loss_layer_018": 0.022705, "value_mse_loss_layer_019": 0.025391, "value_mse_loss_layer_020": 0.027954, "value_mse_loss_layer_021": 0.031128, "value_mse_loss_layer_022": 0.030273, "value_mse_loss_layer_023": 0.034424, "value_mse_loss_layer_024": 0.039062, "value_mse_loss_layer_025": 0.045654, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.048828, "value_mse_loss_layer_028": 0.055664, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.075195, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.00014, "vq_loss_layer_007": 0.000196, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000296, "vq_loss_layer_010": 0.000216, "vq_loss_layer_011": 0.000241, "vq_loss_layer_012": 0.000391, "vq_loss_layer_013": 0.000355, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000479, "vq_loss_layer_016": 0.000408, "vq_loss_layer_017": 0.000393, "vq_loss_layer_018": 0.000259, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000215, "vq_loss_layer_021": 0.000381, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.00029, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000801, "vq_loss_layer_029": 0.000904, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.004517 }, { "ce_loss": 2.348855, "epoch": 0.0064, "grad_norm": 0.0029865966644138098, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.053659, "kv_vq_loss": 0.000521, "learning_rate": 0.0009515449934959716, "loss": 0.054169, "step": 6400, "value_mse_loss_layer_000": 0.000614, "value_mse_loss_layer_001": 0.001884, "value_mse_loss_layer_002": 0.006958, "value_mse_loss_layer_003": 0.011108, "value_mse_loss_layer_004": 0.010254, "value_mse_loss_layer_005": 0.010071, "value_mse_loss_layer_006": 0.011963, "value_mse_loss_layer_007": 0.012878, "value_mse_loss_layer_008": 0.016113, "value_mse_loss_layer_009": 0.02063, "value_mse_loss_layer_010": 0.017944, "value_mse_loss_layer_011": 0.018677, "value_mse_loss_layer_012": 0.018311, "value_mse_loss_layer_013": 0.020752, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.024292, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.02417, "value_mse_loss_layer_018": 0.021973, "value_mse_loss_layer_019": 0.024414, "value_mse_loss_layer_020": 0.026855, "value_mse_loss_layer_021": 0.029663, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.047363, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.050293, "value_mse_loss_layer_028": 0.054932, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.072266, "value_mse_loss_layer_031": 0.07373, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000198, "vq_loss_layer_008": 0.000203, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.000389, "vq_loss_layer_013": 0.000351, "vq_loss_layer_014": 0.00041, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000412, "vq_loss_layer_017": 0.000359, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000221, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.004059 }, { "ce_loss": 2.290136, "epoch": 0.00641, "grad_norm": 0.002868201117962599, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.05419, "kv_vq_loss": 0.000539, "learning_rate": 0.0009517145073797042, "loss": 0.054712, "step": 6410, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.00708, "value_mse_loss_layer_003": 0.011536, "value_mse_loss_layer_004": 0.010498, "value_mse_loss_layer_005": 0.010193, "value_mse_loss_layer_006": 0.013428, "value_mse_loss_layer_007": 0.013184, "value_mse_loss_layer_008": 0.015625, "value_mse_loss_layer_009": 0.020142, "value_mse_loss_layer_010": 0.01709, "value_mse_loss_layer_011": 0.0177, "value_mse_loss_layer_012": 0.017944, "value_mse_loss_layer_013": 0.019653, "value_mse_loss_layer_014": 0.02063, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.019043, "value_mse_loss_layer_017": 0.023315, "value_mse_loss_layer_018": 0.020996, "value_mse_loss_layer_019": 0.02356, "value_mse_loss_layer_020": 0.025757, "value_mse_loss_layer_021": 0.03418, "value_mse_loss_layer_022": 0.029541, "value_mse_loss_layer_023": 0.033936, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.050049, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.074707, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.07373, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000179, "vq_loss_layer_007": 0.000209, "vq_loss_layer_008": 0.000198, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.00022, "vq_loss_layer_011": 0.000235, "vq_loss_layer_012": 0.000389, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000418, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000397, "vq_loss_layer_017": 0.00036, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000488, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.00024, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.001152, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.302747, "epoch": 0.00642, "grad_norm": 0.0029118601232767105, "key_mse_loss_layer_000": 0.006653, "key_mse_loss_layer_001": 0.015503, "key_mse_loss_layer_002": 0.064941, "key_mse_loss_layer_003": 0.058105, "key_mse_loss_layer_004": 0.060303, "key_mse_loss_layer_005": 0.069336, "key_mse_loss_layer_006": 0.081543, "key_mse_loss_layer_007": 0.083984, "key_mse_loss_layer_008": 0.090332, "key_mse_loss_layer_009": 0.095215, "key_mse_loss_layer_010": 0.108887, "key_mse_loss_layer_011": 0.106934, "key_mse_loss_layer_012": 0.080566, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.109863, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.101074, "key_mse_loss_layer_020": 0.108398, "key_mse_loss_layer_021": 0.103516, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.084961, "key_mse_loss_layer_025": 0.079102, "key_mse_loss_layer_026": 0.097168, "key_mse_loss_layer_027": 0.100098, "key_mse_loss_layer_028": 0.104492, "key_mse_loss_layer_029": 0.106445, "key_mse_loss_layer_030": 0.11084, "key_mse_loss_layer_031": 0.09668, "kv_mse_loss": 0.053317, "kv_vq_loss": 0.0005, "learning_rate": 0.0009518837570172131, "loss": 0.053821, "step": 6420, "value_mse_loss_layer_000": 0.000748, "value_mse_loss_layer_001": 0.001984, "value_mse_loss_layer_002": 0.007263, "value_mse_loss_layer_003": 0.012085, "value_mse_loss_layer_004": 0.010986, "value_mse_loss_layer_005": 0.010681, "value_mse_loss_layer_006": 0.012329, "value_mse_loss_layer_007": 0.01355, "value_mse_loss_layer_008": 0.016235, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.017822, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.020264, "value_mse_loss_layer_013": 0.020264, "value_mse_loss_layer_014": 0.024658, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.021851, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.021362, "value_mse_loss_layer_019": 0.029541, "value_mse_loss_layer_020": 0.026855, "value_mse_loss_layer_021": 0.032959, "value_mse_loss_layer_022": 0.031128, "value_mse_loss_layer_023": 0.034424, "value_mse_loss_layer_024": 0.041504, "value_mse_loss_layer_025": 0.051025, "value_mse_loss_layer_026": 0.042725, "value_mse_loss_layer_027": 0.05542, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.085449, "value_mse_loss_layer_030": 0.084961, "value_mse_loss_layer_031": 0.087402, "vq_loss_layer_000": 1.2e-05, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 0.00011, "vq_loss_layer_006": 0.000163, "vq_loss_layer_007": 0.000239, "vq_loss_layer_008": 0.000263, "vq_loss_layer_009": 0.000299, "vq_loss_layer_010": 0.000299, "vq_loss_layer_011": 0.000301, "vq_loss_layer_012": 0.000492, "vq_loss_layer_013": 0.000393, "vq_loss_layer_014": 0.000626, "vq_loss_layer_015": 0.000599, "vq_loss_layer_016": 0.000561, "vq_loss_layer_017": 0.000427, "vq_loss_layer_018": 0.000296, "vq_loss_layer_019": 0.00033, "vq_loss_layer_020": 0.000277, "vq_loss_layer_021": 0.000504, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000444, "vq_loss_layer_025": 0.000698, "vq_loss_layer_026": 0.000858, "vq_loss_layer_027": 0.000946, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.002411, "vq_loss_layer_030": 0.004852, "vq_loss_layer_031": 0.006653 }, { "ce_loss": 2.280152, "epoch": 0.00643, "grad_norm": 0.002614696742966771, "key_mse_loss_layer_000": 0.00531, "key_mse_loss_layer_001": 0.012207, "key_mse_loss_layer_002": 0.059082, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.053473, "kv_vq_loss": 0.000522, "learning_rate": 0.0009520527432310555, "loss": 0.053998, "step": 6430, "value_mse_loss_layer_000": 0.000679, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.007294, "value_mse_loss_layer_003": 0.012024, "value_mse_loss_layer_004": 0.011353, "value_mse_loss_layer_005": 0.010864, "value_mse_loss_layer_006": 0.013367, "value_mse_loss_layer_007": 0.013611, "value_mse_loss_layer_008": 0.016479, "value_mse_loss_layer_009": 0.022705, "value_mse_loss_layer_010": 0.018188, "value_mse_loss_layer_011": 0.018799, "value_mse_loss_layer_012": 0.018921, "value_mse_loss_layer_013": 0.02063, "value_mse_loss_layer_014": 0.022461, "value_mse_loss_layer_015": 0.023926, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.024048, "value_mse_loss_layer_018": 0.021729, "value_mse_loss_layer_019": 0.025146, "value_mse_loss_layer_020": 0.026733, "value_mse_loss_layer_021": 0.030396, "value_mse_loss_layer_022": 0.032715, "value_mse_loss_layer_023": 0.034668, "value_mse_loss_layer_024": 0.043457, "value_mse_loss_layer_025": 0.04834, "value_mse_loss_layer_026": 0.042725, "value_mse_loss_layer_027": 0.053467, "value_mse_loss_layer_028": 0.059082, "value_mse_loss_layer_029": 0.085449, "value_mse_loss_layer_030": 0.081543, "value_mse_loss_layer_031": 0.080566, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 4.5e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000199, "vq_loss_layer_007": 0.000195, "vq_loss_layer_008": 0.00028, "vq_loss_layer_009": 0.000359, "vq_loss_layer_010": 0.000284, "vq_loss_layer_011": 0.000307, "vq_loss_layer_012": 0.000412, "vq_loss_layer_013": 0.000404, "vq_loss_layer_014": 0.000542, "vq_loss_layer_015": 0.000542, "vq_loss_layer_016": 0.000538, "vq_loss_layer_017": 0.000458, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.0002, "vq_loss_layer_020": 0.000265, "vq_loss_layer_021": 0.000439, "vq_loss_layer_022": 0.000353, "vq_loss_layer_023": 0.000341, "vq_loss_layer_024": 0.000422, "vq_loss_layer_025": 0.000471, "vq_loss_layer_026": 0.000607, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.001221, "vq_loss_layer_029": 0.001595, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.006073 }, { "ce_loss": 2.330762, "epoch": 0.00644, "grad_norm": 0.004015684127807617, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.045166, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.053625, "kv_vq_loss": 0.000534, "learning_rate": 0.0009522214668399528, "loss": 0.054153, "step": 6440, "value_mse_loss_layer_000": 0.000626, "value_mse_loss_layer_001": 0.001877, "value_mse_loss_layer_002": 0.006927, "value_mse_loss_layer_003": 0.011414, "value_mse_loss_layer_004": 0.011597, "value_mse_loss_layer_005": 0.010254, "value_mse_loss_layer_006": 0.012268, "value_mse_loss_layer_007": 0.013367, "value_mse_loss_layer_008": 0.016235, "value_mse_loss_layer_009": 0.021606, "value_mse_loss_layer_010": 0.017456, "value_mse_loss_layer_011": 0.018066, "value_mse_loss_layer_012": 0.019409, "value_mse_loss_layer_013": 0.021606, "value_mse_loss_layer_014": 0.021362, "value_mse_loss_layer_015": 0.023926, "value_mse_loss_layer_016": 0.019775, "value_mse_loss_layer_017": 0.023926, "value_mse_loss_layer_018": 0.02063, "value_mse_loss_layer_019": 0.024048, "value_mse_loss_layer_020": 0.028564, "value_mse_loss_layer_021": 0.028931, "value_mse_loss_layer_022": 0.029663, "value_mse_loss_layer_023": 0.037354, "value_mse_loss_layer_024": 0.040039, "value_mse_loss_layer_025": 0.047119, "value_mse_loss_layer_026": 0.038574, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.052734, "value_mse_loss_layer_029": 0.084473, "value_mse_loss_layer_030": 0.07373, "value_mse_loss_layer_031": 0.074219, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 9.7e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000185, "vq_loss_layer_008": 0.000212, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000216, "vq_loss_layer_011": 0.000228, "vq_loss_layer_012": 0.000387, "vq_loss_layer_013": 0.000351, "vq_loss_layer_014": 0.000414, "vq_loss_layer_015": 0.000456, "vq_loss_layer_016": 0.000422, "vq_loss_layer_017": 0.00038, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000257, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000469, "vq_loss_layer_024": 0.000395, "vq_loss_layer_025": 0.000481, "vq_loss_layer_026": 0.000656, "vq_loss_layer_027": 0.000751, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.002396, "vq_loss_layer_030": 0.003433, "vq_loss_layer_031": 0.006866 }, { "ce_loss": 2.312767, "epoch": 0.00645, "grad_norm": 0.0024793229531496763, "key_mse_loss_layer_000": 0.003662, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.053427, "kv_vq_loss": 0.000516, "learning_rate": 0.0009523899286588167, "loss": 0.053934, "step": 6450, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.001892, "value_mse_loss_layer_002": 0.006927, "value_mse_loss_layer_003": 0.010986, "value_mse_loss_layer_004": 0.010437, "value_mse_loss_layer_005": 0.010071, "value_mse_loss_layer_006": 0.012024, "value_mse_loss_layer_007": 0.013123, "value_mse_loss_layer_008": 0.015869, "value_mse_loss_layer_009": 0.020264, "value_mse_loss_layer_010": 0.017212, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.018555, "value_mse_loss_layer_013": 0.02002, "value_mse_loss_layer_014": 0.020996, "value_mse_loss_layer_015": 0.023071, "value_mse_loss_layer_016": 0.02002, "value_mse_loss_layer_017": 0.023926, "value_mse_loss_layer_018": 0.021118, "value_mse_loss_layer_019": 0.026367, "value_mse_loss_layer_020": 0.026733, "value_mse_loss_layer_021": 0.037842, "value_mse_loss_layer_022": 0.030151, "value_mse_loss_layer_023": 0.034424, "value_mse_loss_layer_024": 0.037354, "value_mse_loss_layer_025": 0.042725, "value_mse_loss_layer_026": 0.039307, "value_mse_loss_layer_027": 0.048828, "value_mse_loss_layer_028": 0.052979, "value_mse_loss_layer_029": 0.069824, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.072266, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 7.3e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000129, "vq_loss_layer_007": 0.000205, "vq_loss_layer_008": 0.000213, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.00021, "vq_loss_layer_011": 0.000257, "vq_loss_layer_012": 0.000437, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000433, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000437, "vq_loss_layer_017": 0.000397, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.000561, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000315, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000557, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.000935, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.00412 }, { "ce_loss": 2.335994, "epoch": 0.00646, "grad_norm": 0.002921496285125613, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.052832, "kv_vq_loss": 0.000504, "learning_rate": 0.0009525581294987709, "loss": 0.053339, "step": 6460, "value_mse_loss_layer_000": 0.000626, "value_mse_loss_layer_001": 0.001884, "value_mse_loss_layer_002": 0.007751, "value_mse_loss_layer_003": 0.011169, "value_mse_loss_layer_004": 0.010193, "value_mse_loss_layer_005": 0.01062, "value_mse_loss_layer_006": 0.013184, "value_mse_loss_layer_007": 0.013123, "value_mse_loss_layer_008": 0.016113, "value_mse_loss_layer_009": 0.020508, "value_mse_loss_layer_010": 0.016846, "value_mse_loss_layer_011": 0.018311, "value_mse_loss_layer_012": 0.018555, "value_mse_loss_layer_013": 0.02063, "value_mse_loss_layer_014": 0.023193, "value_mse_loss_layer_015": 0.024048, "value_mse_loss_layer_016": 0.020874, "value_mse_loss_layer_017": 0.023804, "value_mse_loss_layer_018": 0.022461, "value_mse_loss_layer_019": 0.025513, "value_mse_loss_layer_020": 0.026489, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.031006, "value_mse_loss_layer_023": 0.047119, "value_mse_loss_layer_024": 0.039551, "value_mse_loss_layer_025": 0.04541, "value_mse_loss_layer_026": 0.040771, "value_mse_loss_layer_027": 0.050781, "value_mse_loss_layer_028": 0.056641, "value_mse_loss_layer_029": 0.074707, "value_mse_loss_layer_030": 0.074707, "value_mse_loss_layer_031": 0.074707, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 8.8e-05, "vq_loss_layer_006": 0.000175, "vq_loss_layer_007": 0.0002, "vq_loss_layer_008": 0.000192, "vq_loss_layer_009": 0.000244, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.000477, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.000443, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.00023, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000433, "vq_loss_layer_024": 0.000243, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.004211 }, { "ce_loss": 2.318937, "epoch": 0.00647, "grad_norm": 0.004889246076345444, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.053687, "kv_vq_loss": 0.000535, "learning_rate": 0.000952726070167175, "loss": 0.054211, "step": 6470, "value_mse_loss_layer_000": 0.000626, "value_mse_loss_layer_001": 0.001884, "value_mse_loss_layer_002": 0.006989, "value_mse_loss_layer_003": 0.014404, "value_mse_loss_layer_004": 0.01001, "value_mse_loss_layer_005": 0.01001, "value_mse_loss_layer_006": 0.011902, "value_mse_loss_layer_007": 0.012878, "value_mse_loss_layer_008": 0.016113, "value_mse_loss_layer_009": 0.020264, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.017944, "value_mse_loss_layer_012": 0.020386, "value_mse_loss_layer_013": 0.020386, "value_mse_loss_layer_014": 0.021729, "value_mse_loss_layer_015": 0.02478, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.023926, "value_mse_loss_layer_018": 0.020752, "value_mse_loss_layer_019": 0.025757, "value_mse_loss_layer_020": 0.029541, "value_mse_loss_layer_021": 0.032959, "value_mse_loss_layer_022": 0.032471, "value_mse_loss_layer_023": 0.035889, "value_mse_loss_layer_024": 0.036377, "value_mse_loss_layer_025": 0.049561, "value_mse_loss_layer_026": 0.043945, "value_mse_loss_layer_027": 0.051025, "value_mse_loss_layer_028": 0.054688, "value_mse_loss_layer_029": 0.100586, "value_mse_loss_layer_030": 0.070801, "value_mse_loss_layer_031": 0.071777, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000191, "vq_loss_layer_008": 0.000212, "vq_loss_layer_009": 0.000243, "vq_loss_layer_010": 0.000237, "vq_loss_layer_011": 0.00023, "vq_loss_layer_012": 0.000486, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000412, "vq_loss_layer_015": 0.00046, "vq_loss_layer_016": 0.000414, "vq_loss_layer_017": 0.000353, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000242, "vq_loss_layer_021": 0.000427, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.001244, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.003876 }, { "ce_loss": 2.307802, "epoch": 0.00648, "grad_norm": 0.002797880442813039, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.053766, "kv_vq_loss": 0.000515, "learning_rate": 0.0009528937514676483, "loss": 0.054266, "step": 6480, "value_mse_loss_layer_000": 0.000652, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.006958, "value_mse_loss_layer_003": 0.012512, "value_mse_loss_layer_004": 0.010254, "value_mse_loss_layer_005": 0.01001, "value_mse_loss_layer_006": 0.012024, "value_mse_loss_layer_007": 0.013062, "value_mse_loss_layer_008": 0.015869, "value_mse_loss_layer_009": 0.02002, "value_mse_loss_layer_010": 0.017334, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.018799, "value_mse_loss_layer_013": 0.019531, "value_mse_loss_layer_014": 0.02063, "value_mse_loss_layer_015": 0.023193, "value_mse_loss_layer_016": 0.019531, "value_mse_loss_layer_017": 0.024048, "value_mse_loss_layer_018": 0.021484, "value_mse_loss_layer_019": 0.024536, "value_mse_loss_layer_020": 0.027466, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.030396, "value_mse_loss_layer_023": 0.034424, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.043945, "value_mse_loss_layer_026": 0.041016, "value_mse_loss_layer_027": 0.050781, "value_mse_loss_layer_028": 0.0625, "value_mse_loss_layer_029": 0.076172, "value_mse_loss_layer_030": 0.070801, "value_mse_loss_layer_031": 0.071777, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000214, "vq_loss_layer_008": 0.0002, "vq_loss_layer_009": 0.00024, "vq_loss_layer_010": 0.000221, "vq_loss_layer_011": 0.000239, "vq_loss_layer_012": 0.000408, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000395, "vq_loss_layer_017": 0.000402, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000213, "vq_loss_layer_021": 0.000397, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.001137, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.004089 }, { "ce_loss": 2.334369, "epoch": 0.00649, "grad_norm": 0.00265129329636693, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.062988, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.095215, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.07959, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.11377, "key_mse_loss_layer_016": 0.104004, "key_mse_loss_layer_017": 0.10498, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.098633, "key_mse_loss_layer_022": 0.100098, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.053455, "kv_vq_loss": 0.000508, "learning_rate": 0.0009530611742000922, "loss": 0.05397, "step": 6490, "value_mse_loss_layer_000": 0.000618, "value_mse_loss_layer_001": 0.001884, "value_mse_loss_layer_002": 0.008423, "value_mse_loss_layer_003": 0.011719, "value_mse_loss_layer_004": 0.011108, "value_mse_loss_layer_005": 0.011597, "value_mse_loss_layer_006": 0.01239, "value_mse_loss_layer_007": 0.014038, "value_mse_loss_layer_008": 0.016357, "value_mse_loss_layer_009": 0.021729, "value_mse_loss_layer_010": 0.018555, "value_mse_loss_layer_011": 0.019775, "value_mse_loss_layer_012": 0.021973, "value_mse_loss_layer_013": 0.022095, "value_mse_loss_layer_014": 0.022705, "value_mse_loss_layer_015": 0.024414, "value_mse_loss_layer_016": 0.019897, "value_mse_loss_layer_017": 0.023682, "value_mse_loss_layer_018": 0.020508, "value_mse_loss_layer_019": 0.023682, "value_mse_loss_layer_020": 0.024658, "value_mse_loss_layer_021": 0.030884, "value_mse_loss_layer_022": 0.030151, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.041748, "value_mse_loss_layer_026": 0.036377, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.072754, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 3.8e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 8.5e-05, "vq_loss_layer_005": 0.000137, "vq_loss_layer_006": 0.000155, "vq_loss_layer_007": 0.000226, "vq_loss_layer_008": 0.000244, "vq_loss_layer_009": 0.000303, "vq_loss_layer_010": 0.000311, "vq_loss_layer_011": 0.00032, "vq_loss_layer_012": 0.000576, "vq_loss_layer_013": 0.000395, "vq_loss_layer_014": 0.000557, "vq_loss_layer_015": 0.000591, "vq_loss_layer_016": 0.000496, "vq_loss_layer_017": 0.000425, "vq_loss_layer_018": 0.000275, "vq_loss_layer_019": 0.000244, "vq_loss_layer_020": 0.000319, "vq_loss_layer_021": 0.000633, "vq_loss_layer_022": 0.000484, "vq_loss_layer_023": 0.000439, "vq_loss_layer_024": 0.000542, "vq_loss_layer_025": 0.00069, "vq_loss_layer_026": 0.000874, "vq_loss_layer_027": 0.000847, "vq_loss_layer_028": 0.001801, "vq_loss_layer_029": 0.001549, "vq_loss_layer_030": 0.003723, "vq_loss_layer_031": 0.006714 }, { "ce_loss": 2.270924, "epoch": 0.0065, "grad_norm": 0.0029931964818388224, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.05336, "kv_vq_loss": 0.000525, "learning_rate": 0.0009532283391607137, "loss": 0.053882, "step": 6500, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.001869, "value_mse_loss_layer_002": 0.007202, "value_mse_loss_layer_003": 0.011108, "value_mse_loss_layer_004": 0.010681, "value_mse_loss_layer_005": 0.009827, "value_mse_loss_layer_006": 0.012146, "value_mse_loss_layer_007": 0.012756, "value_mse_loss_layer_008": 0.015564, "value_mse_loss_layer_009": 0.020874, "value_mse_loss_layer_010": 0.016968, "value_mse_loss_layer_011": 0.0177, "value_mse_loss_layer_012": 0.018311, "value_mse_loss_layer_013": 0.02002, "value_mse_loss_layer_014": 0.020752, "value_mse_loss_layer_015": 0.02356, "value_mse_loss_layer_016": 0.020264, "value_mse_loss_layer_017": 0.023926, "value_mse_loss_layer_018": 0.02124, "value_mse_loss_layer_019": 0.023926, "value_mse_loss_layer_020": 0.025879, "value_mse_loss_layer_021": 0.029907, "value_mse_loss_layer_022": 0.030151, "value_mse_loss_layer_023": 0.032715, "value_mse_loss_layer_024": 0.038818, "value_mse_loss_layer_025": 0.053467, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.056152, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.069336, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 9.2e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000136, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000204, "vq_loss_layer_011": 0.000242, "vq_loss_layer_012": 0.000391, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000416, "vq_loss_layer_017": 0.000364, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.00041, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.00029, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.000423, "vq_loss_layer_026": 0.000465, "vq_loss_layer_027": 0.000786, "vq_loss_layer_028": 0.00079, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.319941, "epoch": 0.00651, "grad_norm": 0.0031327230390161276, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.067871, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.05332, "kv_vq_loss": 0.000511, "learning_rate": 0.0009533952471420478, "loss": 0.05383, "step": 6510, "value_mse_loss_layer_000": 0.000622, "value_mse_loss_layer_001": 0.001869, "value_mse_loss_layer_002": 0.006775, "value_mse_loss_layer_003": 0.011292, "value_mse_loss_layer_004": 0.010254, "value_mse_loss_layer_005": 0.009888, "value_mse_loss_layer_006": 0.012207, "value_mse_loss_layer_007": 0.013123, "value_mse_loss_layer_008": 0.015869, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.02002, "value_mse_loss_layer_011": 0.017456, "value_mse_loss_layer_012": 0.02124, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.020874, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.023071, "value_mse_loss_layer_018": 0.020874, "value_mse_loss_layer_019": 0.023926, "value_mse_loss_layer_020": 0.029175, "value_mse_loss_layer_021": 0.029907, "value_mse_loss_layer_022": 0.029663, "value_mse_loss_layer_023": 0.033691, "value_mse_loss_layer_024": 0.037354, "value_mse_loss_layer_025": 0.045898, "value_mse_loss_layer_026": 0.040527, "value_mse_loss_layer_027": 0.048584, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.075684, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000195, "vq_loss_layer_008": 0.0002, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000256, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000591, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000425, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000341, "vq_loss_layer_021": 0.000345, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.003738 }, { "ce_loss": 2.268557, "epoch": 0.00652, "grad_norm": 0.0032930653542280197, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.053296, "kv_vq_loss": 0.00052, "learning_rate": 0.0009535618989329799, "loss": 0.053818, "step": 6520, "value_mse_loss_layer_000": 0.000633, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.007233, "value_mse_loss_layer_003": 0.011414, "value_mse_loss_layer_004": 0.011169, "value_mse_loss_layer_005": 0.01239, "value_mse_loss_layer_006": 0.012146, "value_mse_loss_layer_007": 0.012878, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.02063, "value_mse_loss_layer_010": 0.018188, "value_mse_loss_layer_011": 0.018066, "value_mse_loss_layer_012": 0.019775, "value_mse_loss_layer_013": 0.020874, "value_mse_loss_layer_014": 0.021606, "value_mse_loss_layer_015": 0.024048, "value_mse_loss_layer_016": 0.020264, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.025757, "value_mse_loss_layer_019": 0.026001, "value_mse_loss_layer_020": 0.026367, "value_mse_loss_layer_021": 0.030518, "value_mse_loss_layer_022": 0.033203, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.045654, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.050537, "value_mse_loss_layer_028": 0.054688, "value_mse_loss_layer_029": 0.073242, "value_mse_loss_layer_030": 0.077637, "value_mse_loss_layer_031": 0.074219, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 8.6e-05, "vq_loss_layer_005": 0.000134, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000184, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.00022, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.000422, "vq_loss_layer_013": 0.000402, "vq_loss_layer_014": 0.000423, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000429, "vq_loss_layer_017": 0.000416, "vq_loss_layer_018": 0.000278, "vq_loss_layer_019": 0.000197, "vq_loss_layer_020": 0.000226, "vq_loss_layer_021": 0.000387, "vq_loss_layer_022": 0.00033, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.00034, "vq_loss_layer_025": 0.00036, "vq_loss_layer_026": 0.000568, "vq_loss_layer_027": 0.000637, "vq_loss_layer_028": 0.000854, "vq_loss_layer_029": 0.00135, "vq_loss_layer_030": 0.003067, "vq_loss_layer_031": 0.005005 }, { "ce_loss": 2.328793, "epoch": 0.00653, "grad_norm": 0.002619237871840596, "key_mse_loss_layer_000": 0.003632, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.053772, "kv_vq_loss": 0.000525, "learning_rate": 0.0009537282953187683, "loss": 0.054282, "step": 6530, "value_mse_loss_layer_000": 0.000641, "value_mse_loss_layer_001": 0.001907, "value_mse_loss_layer_002": 0.007111, "value_mse_loss_layer_003": 0.011475, "value_mse_loss_layer_004": 0.010254, "value_mse_loss_layer_005": 0.010498, "value_mse_loss_layer_006": 0.012207, "value_mse_loss_layer_007": 0.012817, "value_mse_loss_layer_008": 0.015869, "value_mse_loss_layer_009": 0.019897, "value_mse_loss_layer_010": 0.016846, "value_mse_loss_layer_011": 0.0177, "value_mse_loss_layer_012": 0.018311, "value_mse_loss_layer_013": 0.02002, "value_mse_loss_layer_014": 0.023193, "value_mse_loss_layer_015": 0.025513, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.023926, "value_mse_loss_layer_018": 0.023071, "value_mse_loss_layer_019": 0.025024, "value_mse_loss_layer_020": 0.026001, "value_mse_loss_layer_021": 0.034668, "value_mse_loss_layer_022": 0.030396, "value_mse_loss_layer_023": 0.036865, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.044434, "value_mse_loss_layer_026": 0.042236, "value_mse_loss_layer_027": 0.051025, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.074219, "value_mse_loss_layer_031": 0.072266, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000147, "vq_loss_layer_007": 0.000199, "vq_loss_layer_008": 0.000199, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.000208, "vq_loss_layer_011": 0.000244, "vq_loss_layer_012": 0.000391, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.000446, "vq_loss_layer_015": 0.000519, "vq_loss_layer_016": 0.00042, "vq_loss_layer_017": 0.000389, "vq_loss_layer_018": 0.000226, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000477, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000301, "vq_loss_layer_025": 0.000364, "vq_loss_layer_026": 0.000614, "vq_loss_layer_027": 0.000622, "vq_loss_layer_028": 0.000828, "vq_loss_layer_029": 0.001122, "vq_loss_layer_030": 0.00264, "vq_loss_layer_031": 0.004395 }, { "ce_loss": 2.303681, "epoch": 0.00654, "grad_norm": 0.003418211592361331, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.053375, "kv_vq_loss": 0.000501, "learning_rate": 0.0009538944370810667, "loss": 0.053888, "step": 6540, "value_mse_loss_layer_000": 0.000633, "value_mse_loss_layer_001": 0.001915, "value_mse_loss_layer_002": 0.006714, "value_mse_loss_layer_003": 0.012695, "value_mse_loss_layer_004": 0.009705, "value_mse_loss_layer_005": 0.009766, "value_mse_loss_layer_006": 0.011597, "value_mse_loss_layer_007": 0.013123, "value_mse_loss_layer_008": 0.015564, "value_mse_loss_layer_009": 0.020142, "value_mse_loss_layer_010": 0.017944, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.017944, "value_mse_loss_layer_013": 0.019531, "value_mse_loss_layer_014": 0.02063, "value_mse_loss_layer_015": 0.023315, "value_mse_loss_layer_016": 0.019043, "value_mse_loss_layer_017": 0.023071, "value_mse_loss_layer_018": 0.020874, "value_mse_loss_layer_019": 0.023926, "value_mse_loss_layer_020": 0.026123, "value_mse_loss_layer_021": 0.031006, "value_mse_loss_layer_022": 0.030029, "value_mse_loss_layer_023": 0.034912, "value_mse_loss_layer_024": 0.04126, "value_mse_loss_layer_025": 0.04834, "value_mse_loss_layer_026": 0.038818, "value_mse_loss_layer_027": 0.053711, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.086914, "value_mse_loss_layer_030": 0.071289, "value_mse_loss_layer_031": 0.070312, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000228, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000243, "vq_loss_layer_010": 0.000205, "vq_loss_layer_011": 0.00023, "vq_loss_layer_012": 0.000378, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000401, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000362, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000372, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000294, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.000679, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.002441, "vq_loss_layer_031": 0.003555 }, { "ce_loss": 2.303064, "epoch": 0.00655, "grad_norm": 0.0028658583760261536, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.053275, "kv_vq_loss": 0.000523, "learning_rate": 0.0009540603249979456, "loss": 0.053787, "step": 6550, "value_mse_loss_layer_000": 0.000618, "value_mse_loss_layer_001": 0.001846, "value_mse_loss_layer_002": 0.006805, "value_mse_loss_layer_003": 0.011108, "value_mse_loss_layer_004": 0.010864, "value_mse_loss_layer_005": 0.010254, "value_mse_loss_layer_006": 0.012634, "value_mse_loss_layer_007": 0.012817, "value_mse_loss_layer_008": 0.016357, "value_mse_loss_layer_009": 0.02063, "value_mse_loss_layer_010": 0.016602, "value_mse_loss_layer_011": 0.017944, "value_mse_loss_layer_012": 0.019165, "value_mse_loss_layer_013": 0.02002, "value_mse_loss_layer_014": 0.021606, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.019409, "value_mse_loss_layer_017": 0.022827, "value_mse_loss_layer_018": 0.021118, "value_mse_loss_layer_019": 0.029663, "value_mse_loss_layer_020": 0.025635, "value_mse_loss_layer_021": 0.029541, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.033936, "value_mse_loss_layer_024": 0.037354, "value_mse_loss_layer_025": 0.047119, "value_mse_loss_layer_026": 0.039062, "value_mse_loss_layer_027": 0.049072, "value_mse_loss_layer_028": 0.05249, "value_mse_loss_layer_029": 0.07373, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.071289, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 8.8e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000162, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000271, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.000429, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000234, "vq_loss_layer_019": 0.000223, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.00037, "vq_loss_layer_022": 0.000315, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000277, "vq_loss_layer_025": 0.000334, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.001831, "vq_loss_layer_031": 0.004425 }, { "ce_loss": 2.22154, "epoch": 0.00656, "grad_norm": 0.0032902054954320192, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.053656, "kv_vq_loss": 0.000523, "learning_rate": 0.000954225959843915, "loss": 0.054166, "step": 6560, "value_mse_loss_layer_000": 0.000629, "value_mse_loss_layer_001": 0.001907, "value_mse_loss_layer_002": 0.007233, "value_mse_loss_layer_003": 0.01123, "value_mse_loss_layer_004": 0.009888, "value_mse_loss_layer_005": 0.010559, "value_mse_loss_layer_006": 0.012024, "value_mse_loss_layer_007": 0.013306, "value_mse_loss_layer_008": 0.015869, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.018066, "value_mse_loss_layer_011": 0.018799, "value_mse_loss_layer_012": 0.019409, "value_mse_loss_layer_013": 0.020874, "value_mse_loss_layer_014": 0.021606, "value_mse_loss_layer_015": 0.024902, "value_mse_loss_layer_016": 0.020386, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.021484, "value_mse_loss_layer_019": 0.026611, "value_mse_loss_layer_020": 0.025513, "value_mse_loss_layer_021": 0.032227, "value_mse_loss_layer_022": 0.032959, "value_mse_loss_layer_023": 0.033203, "value_mse_loss_layer_024": 0.035645, "value_mse_loss_layer_025": 0.050293, "value_mse_loss_layer_026": 0.03833, "value_mse_loss_layer_027": 0.059326, "value_mse_loss_layer_028": 0.053955, "value_mse_loss_layer_029": 0.071289, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.070312, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000207, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000239, "vq_loss_layer_010": 0.000259, "vq_loss_layer_011": 0.000261, "vq_loss_layer_012": 0.000412, "vq_loss_layer_013": 0.000368, "vq_loss_layer_014": 0.000427, "vq_loss_layer_015": 0.000477, "vq_loss_layer_016": 0.000446, "vq_loss_layer_017": 0.00045, "vq_loss_layer_018": 0.000235, "vq_loss_layer_019": 0.00021, "vq_loss_layer_020": 0.000252, "vq_loss_layer_021": 0.000469, "vq_loss_layer_022": 0.000368, "vq_loss_layer_023": 0.000315, "vq_loss_layer_024": 0.000277, "vq_loss_layer_025": 0.000399, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000835, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.003937 }, { "ce_loss": 2.277249, "epoch": 0.00657, "grad_norm": 0.003173790406435728, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.053235, "kv_vq_loss": 0.000501, "learning_rate": 0.000954391342389945, "loss": 0.053726, "step": 6570, "value_mse_loss_layer_000": 0.000641, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.00708, "value_mse_loss_layer_003": 0.011292, "value_mse_loss_layer_004": 0.010254, "value_mse_loss_layer_005": 0.010254, "value_mse_loss_layer_006": 0.012024, "value_mse_loss_layer_007": 0.013, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.021606, "value_mse_loss_layer_010": 0.016724, "value_mse_loss_layer_011": 0.017578, "value_mse_loss_layer_012": 0.018188, "value_mse_loss_layer_013": 0.019897, "value_mse_loss_layer_014": 0.022095, "value_mse_loss_layer_015": 0.02417, "value_mse_loss_layer_016": 0.021606, "value_mse_loss_layer_017": 0.023926, "value_mse_loss_layer_018": 0.021362, "value_mse_loss_layer_019": 0.026001, "value_mse_loss_layer_020": 0.026123, "value_mse_loss_layer_021": 0.033447, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.037842, "value_mse_loss_layer_025": 0.050537, "value_mse_loss_layer_026": 0.039551, "value_mse_loss_layer_027": 0.054199, "value_mse_loss_layer_028": 0.057129, "value_mse_loss_layer_029": 0.083008, "value_mse_loss_layer_030": 0.071777, "value_mse_loss_layer_031": 0.07373, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000194, "vq_loss_layer_008": 0.000189, "vq_loss_layer_009": 0.000328, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.00036, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000412, "vq_loss_layer_015": 0.000431, "vq_loss_layer_016": 0.00045, "vq_loss_layer_017": 0.000383, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.000198, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.00041, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000307, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000391, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.001015, "vq_loss_layer_029": 0.001389, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.004272 }, { "ce_loss": 2.269297, "epoch": 0.00658, "grad_norm": 0.0027390215545892715, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.053439, "kv_vq_loss": 0.000516, "learning_rate": 0.0009545564734034887, "loss": 0.05394, "step": 6580, "value_mse_loss_layer_000": 0.000614, "value_mse_loss_layer_001": 0.001877, "value_mse_loss_layer_002": 0.006744, "value_mse_loss_layer_003": 0.010742, "value_mse_loss_layer_004": 0.010803, "value_mse_loss_layer_005": 0.009583, "value_mse_loss_layer_006": 0.012146, "value_mse_loss_layer_007": 0.013, "value_mse_loss_layer_008": 0.016846, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.017578, "value_mse_loss_layer_011": 0.018311, "value_mse_loss_layer_012": 0.019653, "value_mse_loss_layer_013": 0.020386, "value_mse_loss_layer_014": 0.020752, "value_mse_loss_layer_015": 0.023926, "value_mse_loss_layer_016": 0.019653, "value_mse_loss_layer_017": 0.025146, "value_mse_loss_layer_018": 0.020874, "value_mse_loss_layer_019": 0.02478, "value_mse_loss_layer_020": 0.026855, "value_mse_loss_layer_021": 0.030762, "value_mse_loss_layer_022": 0.030884, "value_mse_loss_layer_023": 0.034424, "value_mse_loss_layer_024": 0.040039, "value_mse_loss_layer_025": 0.04541, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.050537, "value_mse_loss_layer_028": 0.05542, "value_mse_loss_layer_029": 0.076172, "value_mse_loss_layer_030": 0.071289, "value_mse_loss_layer_031": 0.072754, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 0.000126, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000139, "vq_loss_layer_007": 0.000215, "vq_loss_layer_008": 0.000232, "vq_loss_layer_009": 0.000261, "vq_loss_layer_010": 0.00023, "vq_loss_layer_011": 0.000252, "vq_loss_layer_012": 0.000471, "vq_loss_layer_013": 0.000347, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000465, "vq_loss_layer_016": 0.000397, "vq_loss_layer_017": 0.000437, "vq_loss_layer_018": 0.000212, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.003708 }, { "ce_loss": 2.304967, "epoch": 0.00659, "grad_norm": 0.002679035533219576, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.042236, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.058838, "kv_mse_loss": 0.053027, "kv_vq_loss": 0.000499, "learning_rate": 0.0009547213536485023, "loss": 0.053534, "step": 6590, "value_mse_loss_layer_000": 0.000622, "value_mse_loss_layer_001": 0.001884, "value_mse_loss_layer_002": 0.007324, "value_mse_loss_layer_003": 0.011414, "value_mse_loss_layer_004": 0.010803, "value_mse_loss_layer_005": 0.010925, "value_mse_loss_layer_006": 0.012146, "value_mse_loss_layer_007": 0.013062, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.016479, "value_mse_loss_layer_011": 0.018066, "value_mse_loss_layer_012": 0.018677, "value_mse_loss_layer_013": 0.020508, "value_mse_loss_layer_014": 0.022339, "value_mse_loss_layer_015": 0.02417, "value_mse_loss_layer_016": 0.019531, "value_mse_loss_layer_017": 0.023438, "value_mse_loss_layer_018": 0.022827, "value_mse_loss_layer_019": 0.023804, "value_mse_loss_layer_020": 0.02478, "value_mse_loss_layer_021": 0.029785, "value_mse_loss_layer_022": 0.029663, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.037842, "value_mse_loss_layer_027": 0.049805, "value_mse_loss_layer_028": 0.057373, "value_mse_loss_layer_029": 0.072754, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.072754, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000141, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.000237, "vq_loss_layer_009": 0.000288, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.000296, "vq_loss_layer_012": 0.000408, "vq_loss_layer_013": 0.000404, "vq_loss_layer_014": 0.000557, "vq_loss_layer_015": 0.000568, "vq_loss_layer_016": 0.000479, "vq_loss_layer_017": 0.000395, "vq_loss_layer_018": 0.000284, "vq_loss_layer_019": 0.000192, "vq_loss_layer_020": 0.000237, "vq_loss_layer_021": 0.000519, "vq_loss_layer_022": 0.00033, "vq_loss_layer_023": 0.000355, "vq_loss_layer_024": 0.000477, "vq_loss_layer_025": 0.000511, "vq_loss_layer_026": 0.000736, "vq_loss_layer_027": 0.000824, "vq_loss_layer_028": 0.001396, "vq_loss_layer_029": 0.001266, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.005798 }, { "ce_loss": 2.280131, "epoch": 0.0066, "grad_norm": 0.0027565264608711004, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.053445, "kv_vq_loss": 0.000507, "learning_rate": 0.0009548859838854671, "loss": 0.053958, "step": 6600, "value_mse_loss_layer_000": 0.00061, "value_mse_loss_layer_001": 0.001907, "value_mse_loss_layer_002": 0.006775, "value_mse_loss_layer_003": 0.012024, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.009583, "value_mse_loss_layer_006": 0.011719, "value_mse_loss_layer_007": 0.012695, "value_mse_loss_layer_008": 0.015625, "value_mse_loss_layer_009": 0.020264, "value_mse_loss_layer_010": 0.017456, "value_mse_loss_layer_011": 0.017944, "value_mse_loss_layer_012": 0.018799, "value_mse_loss_layer_013": 0.02063, "value_mse_loss_layer_014": 0.020752, "value_mse_loss_layer_015": 0.024414, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.02356, "value_mse_loss_layer_018": 0.021729, "value_mse_loss_layer_019": 0.027344, "value_mse_loss_layer_020": 0.027344, "value_mse_loss_layer_021": 0.030518, "value_mse_loss_layer_022": 0.030884, "value_mse_loss_layer_023": 0.033203, "value_mse_loss_layer_024": 0.037109, "value_mse_loss_layer_025": 0.049072, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.049316, "value_mse_loss_layer_028": 0.055176, "value_mse_loss_layer_029": 0.069824, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000191, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.000201, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000404, "vq_loss_layer_013": 0.00036, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000483, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000238, "vq_loss_layer_021": 0.000345, "vq_loss_layer_022": 0.000224, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.003769 }, { "ce_loss": 2.237997, "epoch": 0.00661, "grad_norm": 0.002711731940507889, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.053543, "kv_vq_loss": 0.000519, "learning_rate": 0.0009550503648714099, "loss": 0.054056, "step": 6610, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001831, "value_mse_loss_layer_002": 0.006897, "value_mse_loss_layer_003": 0.01062, "value_mse_loss_layer_004": 0.009888, "value_mse_loss_layer_005": 0.009705, "value_mse_loss_layer_006": 0.012207, "value_mse_loss_layer_007": 0.012573, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.020264, "value_mse_loss_layer_010": 0.016724, "value_mse_loss_layer_011": 0.0177, "value_mse_loss_layer_012": 0.018555, "value_mse_loss_layer_013": 0.019775, "value_mse_loss_layer_014": 0.020508, "value_mse_loss_layer_015": 0.023804, "value_mse_loss_layer_016": 0.019775, "value_mse_loss_layer_017": 0.023926, "value_mse_loss_layer_018": 0.020996, "value_mse_loss_layer_019": 0.02478, "value_mse_loss_layer_020": 0.026733, "value_mse_loss_layer_021": 0.030396, "value_mse_loss_layer_022": 0.030273, "value_mse_loss_layer_023": 0.033936, "value_mse_loss_layer_024": 0.03833, "value_mse_loss_layer_025": 0.041992, "value_mse_loss_layer_026": 0.039307, "value_mse_loss_layer_027": 0.05542, "value_mse_loss_layer_028": 0.053467, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.071289, "value_mse_loss_layer_031": 0.072754, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000184, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000206, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000401, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000416, "vq_loss_layer_017": 0.000418, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000215, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000473, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000713, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.254795, "epoch": 0.00662, "grad_norm": 0.0026990759652107954, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.053458, "kv_vq_loss": 0.000536, "learning_rate": 0.0009552144973599249, "loss": 0.053986, "step": 6620, "value_mse_loss_layer_000": 0.000618, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.006805, "value_mse_loss_layer_003": 0.011292, "value_mse_loss_layer_004": 0.010132, "value_mse_loss_layer_005": 0.010742, "value_mse_loss_layer_006": 0.011963, "value_mse_loss_layer_007": 0.012939, "value_mse_loss_layer_008": 0.015991, "value_mse_loss_layer_009": 0.021484, "value_mse_loss_layer_010": 0.0177, "value_mse_loss_layer_011": 0.018433, "value_mse_loss_layer_012": 0.020264, "value_mse_loss_layer_013": 0.020264, "value_mse_loss_layer_014": 0.021484, "value_mse_loss_layer_015": 0.024414, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.021484, "value_mse_loss_layer_019": 0.025757, "value_mse_loss_layer_020": 0.026611, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.032715, "value_mse_loss_layer_023": 0.036621, "value_mse_loss_layer_024": 0.03833, "value_mse_loss_layer_025": 0.053223, "value_mse_loss_layer_026": 0.04248, "value_mse_loss_layer_027": 0.053955, "value_mse_loss_layer_028": 0.060791, "value_mse_loss_layer_029": 0.074707, "value_mse_loss_layer_030": 0.072754, "value_mse_loss_layer_031": 0.073242, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000129, "vq_loss_layer_007": 0.000197, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000242, "vq_loss_layer_012": 0.000486, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000393, "vq_loss_layer_017": 0.000359, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.00029, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.000893, "vq_loss_layer_029": 0.001228, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.305454, "epoch": 0.00663, "grad_norm": 0.0022068654652684927, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.053366, "kv_vq_loss": 0.000534, "learning_rate": 0.0009553783821011931, "loss": 0.053894, "step": 6630, "value_mse_loss_layer_000": 0.000633, "value_mse_loss_layer_001": 0.001892, "value_mse_loss_layer_002": 0.006836, "value_mse_loss_layer_003": 0.010986, "value_mse_loss_layer_004": 0.010254, "value_mse_loss_layer_005": 0.010071, "value_mse_loss_layer_006": 0.012512, "value_mse_loss_layer_007": 0.013184, "value_mse_loss_layer_008": 0.015869, "value_mse_loss_layer_009": 0.020508, "value_mse_loss_layer_010": 0.017944, "value_mse_loss_layer_011": 0.017944, "value_mse_loss_layer_012": 0.018311, "value_mse_loss_layer_013": 0.020264, "value_mse_loss_layer_014": 0.021362, "value_mse_loss_layer_015": 0.024658, "value_mse_loss_layer_016": 0.019897, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.022461, "value_mse_loss_layer_019": 0.024658, "value_mse_loss_layer_020": 0.025635, "value_mse_loss_layer_021": 0.031128, "value_mse_loss_layer_022": 0.030396, "value_mse_loss_layer_023": 0.033203, "value_mse_loss_layer_024": 0.042236, "value_mse_loss_layer_025": 0.044678, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.049561, "value_mse_loss_layer_028": 0.053223, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000194, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.00024, "vq_loss_layer_011": 0.000242, "vq_loss_layer_012": 0.000378, "vq_loss_layer_013": 0.000347, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.000475, "vq_loss_layer_016": 0.000406, "vq_loss_layer_017": 0.000395, "vq_loss_layer_018": 0.000231, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.000397, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.00032, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.001038, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.004059 }, { "ce_loss": 2.317698, "epoch": 0.00664, "grad_norm": 0.003255806164816022, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.079102, "key_mse_loss_layer_009": 0.08252, "key_mse_loss_layer_010": 0.093262, "key_mse_loss_layer_011": 0.093262, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.107422, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.092285, "key_mse_loss_layer_016": 0.084473, "key_mse_loss_layer_017": 0.088379, "key_mse_loss_layer_018": 0.093262, "key_mse_loss_layer_019": 0.080078, "key_mse_loss_layer_020": 0.087402, "key_mse_loss_layer_021": 0.083984, "key_mse_loss_layer_022": 0.084961, "key_mse_loss_layer_023": 0.08252, "key_mse_loss_layer_024": 0.065918, "key_mse_loss_layer_025": 0.064453, "key_mse_loss_layer_026": 0.072266, "key_mse_loss_layer_027": 0.072266, "key_mse_loss_layer_028": 0.079102, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.053424, "kv_vq_loss": 0.000512, "learning_rate": 0.0009555420198420043, "loss": 0.053937, "step": 6640, "value_mse_loss_layer_000": 0.000626, "value_mse_loss_layer_001": 0.001968, "value_mse_loss_layer_002": 0.00708, "value_mse_loss_layer_003": 0.011841, "value_mse_loss_layer_004": 0.010315, "value_mse_loss_layer_005": 0.010132, "value_mse_loss_layer_006": 0.012024, "value_mse_loss_layer_007": 0.012756, "value_mse_loss_layer_008": 0.015869, "value_mse_loss_layer_009": 0.020996, "value_mse_loss_layer_010": 0.01709, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.018799, "value_mse_loss_layer_013": 0.02063, "value_mse_loss_layer_014": 0.02124, "value_mse_loss_layer_015": 0.02417, "value_mse_loss_layer_016": 0.019775, "value_mse_loss_layer_017": 0.023438, "value_mse_loss_layer_018": 0.020508, "value_mse_loss_layer_019": 0.024048, "value_mse_loss_layer_020": 0.025757, "value_mse_loss_layer_021": 0.030151, "value_mse_loss_layer_022": 0.02832, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.037354, "value_mse_loss_layer_025": 0.044678, "value_mse_loss_layer_026": 0.03833, "value_mse_loss_layer_027": 0.046143, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.069336, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.078613, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000195, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000393, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000441, "vq_loss_layer_016": 0.000433, "vq_loss_layer_017": 0.000437, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000214, "vq_loss_layer_021": 0.00042, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.00038, "vq_loss_layer_024": 0.000364, "vq_loss_layer_025": 0.000484, "vq_loss_layer_026": 0.000702, "vq_loss_layer_027": 0.000832, "vq_loss_layer_028": 0.001625, "vq_loss_layer_029": 0.002884, "vq_loss_layer_030": 0.003677, "vq_loss_layer_031": 0.007721 }, { "ce_loss": 2.283981, "epoch": 0.00665, "grad_norm": 0.0031133319716900587, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.066406, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.093262, "key_mse_loss_layer_031": 0.084961, "kv_mse_loss": 0.053198, "kv_vq_loss": 0.000528, "learning_rate": 0.000955705411325776, "loss": 0.05372, "step": 6650, "value_mse_loss_layer_000": 0.000622, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.006653, "value_mse_loss_layer_003": 0.010498, "value_mse_loss_layer_004": 0.009583, "value_mse_loss_layer_005": 0.009216, "value_mse_loss_layer_006": 0.011475, "value_mse_loss_layer_007": 0.012573, "value_mse_loss_layer_008": 0.014893, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.01709, "value_mse_loss_layer_011": 0.017944, "value_mse_loss_layer_012": 0.019287, "value_mse_loss_layer_013": 0.020264, "value_mse_loss_layer_014": 0.02124, "value_mse_loss_layer_015": 0.024048, "value_mse_loss_layer_016": 0.020874, "value_mse_loss_layer_017": 0.02356, "value_mse_loss_layer_018": 0.020142, "value_mse_loss_layer_019": 0.02478, "value_mse_loss_layer_020": 0.025879, "value_mse_loss_layer_021": 0.028442, "value_mse_loss_layer_022": 0.031128, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.043213, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.052734, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.065918, "value_mse_loss_layer_030": 0.070801, "value_mse_loss_layer_031": 0.06543, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000142, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000463, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000418, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000387, "vq_loss_layer_017": 0.000362, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000194, "vq_loss_layer_020": 0.000273, "vq_loss_layer_021": 0.000381, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.000334, "vq_loss_layer_024": 0.000238, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.000782, "vq_loss_layer_028": 0.000893, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.003937 }, { "ce_loss": 2.318037, "epoch": 0.00666, "grad_norm": 0.0025753977242857218, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.053397, "kv_vq_loss": 0.000507, "learning_rate": 0.0009558685572925751, "loss": 0.0539, "step": 6660, "value_mse_loss_layer_000": 0.000626, "value_mse_loss_layer_001": 0.001862, "value_mse_loss_layer_002": 0.007202, "value_mse_loss_layer_003": 0.012024, "value_mse_loss_layer_004": 0.010376, "value_mse_loss_layer_005": 0.009827, "value_mse_loss_layer_006": 0.011841, "value_mse_loss_layer_007": 0.012451, "value_mse_loss_layer_008": 0.015625, "value_mse_loss_layer_009": 0.021118, "value_mse_loss_layer_010": 0.016724, "value_mse_loss_layer_011": 0.018066, "value_mse_loss_layer_012": 0.018433, "value_mse_loss_layer_013": 0.02002, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.023804, "value_mse_loss_layer_016": 0.019775, "value_mse_loss_layer_017": 0.025513, "value_mse_loss_layer_018": 0.021606, "value_mse_loss_layer_019": 0.025269, "value_mse_loss_layer_020": 0.025635, "value_mse_loss_layer_021": 0.043457, "value_mse_loss_layer_022": 0.029785, "value_mse_loss_layer_023": 0.033936, "value_mse_loss_layer_024": 0.036377, "value_mse_loss_layer_025": 0.045166, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.049561, "value_mse_loss_layer_028": 0.05542, "value_mse_loss_layer_029": 0.068848, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 8.4e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000135, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000301, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.00025, "vq_loss_layer_012": 0.000393, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000441, "vq_loss_layer_016": 0.000387, "vq_loss_layer_017": 0.000523, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000195, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000584, "vq_loss_layer_022": 0.000241, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.000957, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.315283, "epoch": 0.00667, "grad_norm": 0.003070100909098983, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.093262, "key_mse_loss_layer_009": 0.100586, "key_mse_loss_layer_010": 0.112305, "key_mse_loss_layer_011": 0.105957, "key_mse_loss_layer_012": 0.080566, "key_mse_loss_layer_013": 0.139648, "key_mse_loss_layer_014": 0.138672, "key_mse_loss_layer_015": 0.125, "key_mse_loss_layer_016": 0.117676, "key_mse_loss_layer_017": 0.115723, "key_mse_loss_layer_018": 0.122559, "key_mse_loss_layer_019": 0.097168, "key_mse_loss_layer_020": 0.113281, "key_mse_loss_layer_021": 0.105957, "key_mse_loss_layer_022": 0.112793, "key_mse_loss_layer_023": 0.110352, "key_mse_loss_layer_024": 0.087402, "key_mse_loss_layer_025": 0.08252, "key_mse_loss_layer_026": 0.098145, "key_mse_loss_layer_027": 0.097656, "key_mse_loss_layer_028": 0.10498, "key_mse_loss_layer_029": 0.092285, "key_mse_loss_layer_030": 0.101562, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.053421, "kv_vq_loss": 0.000518, "learning_rate": 0.0009560314584791371, "loss": 0.053934, "step": 6670, "value_mse_loss_layer_000": 0.000626, "value_mse_loss_layer_001": 0.00193, "value_mse_loss_layer_002": 0.007263, "value_mse_loss_layer_003": 0.011963, "value_mse_loss_layer_004": 0.010437, "value_mse_loss_layer_005": 0.010071, "value_mse_loss_layer_006": 0.012329, "value_mse_loss_layer_007": 0.013733, "value_mse_loss_layer_008": 0.015381, "value_mse_loss_layer_009": 0.019897, "value_mse_loss_layer_010": 0.0177, "value_mse_loss_layer_011": 0.018066, "value_mse_loss_layer_012": 0.020264, "value_mse_loss_layer_013": 0.019775, "value_mse_loss_layer_014": 0.02124, "value_mse_loss_layer_015": 0.021606, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.019409, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.030396, "value_mse_loss_layer_021": 0.029419, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.038086, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.039551, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.072754, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.2e-05, "vq_loss_layer_003": 4.3e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000242, "vq_loss_layer_008": 0.000242, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.00032, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000568, "vq_loss_layer_013": 0.000378, "vq_loss_layer_014": 0.000648, "vq_loss_layer_015": 0.0005, "vq_loss_layer_016": 0.000441, "vq_loss_layer_017": 0.000374, "vq_loss_layer_018": 0.000238, "vq_loss_layer_019": 0.0002, "vq_loss_layer_020": 0.000319, "vq_loss_layer_021": 0.000607, "vq_loss_layer_022": 0.000357, "vq_loss_layer_023": 0.000603, "vq_loss_layer_024": 0.000385, "vq_loss_layer_025": 0.000652, "vq_loss_layer_026": 0.00071, "vq_loss_layer_027": 0.00082, "vq_loss_layer_028": 0.001434, "vq_loss_layer_029": 0.001289, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.00589 }, { "ce_loss": 2.319777, "epoch": 0.00668, "grad_norm": 0.00314682861790061, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.10791, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.097168, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.053314, "kv_vq_loss": 0.000526, "learning_rate": 0.0009561941156188864, "loss": 0.053842, "step": 6680, "value_mse_loss_layer_000": 0.000614, "value_mse_loss_layer_001": 0.001762, "value_mse_loss_layer_002": 0.006805, "value_mse_loss_layer_003": 0.011047, "value_mse_loss_layer_004": 0.010376, "value_mse_loss_layer_005": 0.010437, "value_mse_loss_layer_006": 0.011719, "value_mse_loss_layer_007": 0.012817, "value_mse_loss_layer_008": 0.015137, "value_mse_loss_layer_009": 0.019287, "value_mse_loss_layer_010": 0.016968, "value_mse_loss_layer_011": 0.017334, "value_mse_loss_layer_012": 0.018555, "value_mse_loss_layer_013": 0.019653, "value_mse_loss_layer_014": 0.020752, "value_mse_loss_layer_015": 0.021973, "value_mse_loss_layer_016": 0.018799, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.023193, "value_mse_loss_layer_020": 0.025024, "value_mse_loss_layer_021": 0.029053, "value_mse_loss_layer_022": 0.028687, "value_mse_loss_layer_023": 0.033936, "value_mse_loss_layer_024": 0.037354, "value_mse_loss_layer_025": 0.04248, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.049316, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.072266, "value_mse_loss_layer_030": 0.079102, "value_mse_loss_layer_031": 0.076172, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 9.9e-05, "vq_loss_layer_006": 0.000145, "vq_loss_layer_007": 0.000211, "vq_loss_layer_008": 0.000218, "vq_loss_layer_009": 0.000261, "vq_loss_layer_010": 0.000275, "vq_loss_layer_011": 0.000267, "vq_loss_layer_012": 0.000444, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000429, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000425, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000203, "vq_loss_layer_020": 0.000228, "vq_loss_layer_021": 0.000374, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000408, "vq_loss_layer_024": 0.000345, "vq_loss_layer_025": 0.00045, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000641, "vq_loss_layer_028": 0.001228, "vq_loss_layer_029": 0.001228, "vq_loss_layer_030": 0.003708, "vq_loss_layer_031": 0.005402 }, { "ce_loss": 2.339701, "epoch": 0.00669, "grad_norm": 0.002945425920188427, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.052908, "kv_vq_loss": 0.000527, "learning_rate": 0.0009563565294419558, "loss": 0.05343, "step": 6690, "value_mse_loss_layer_000": 0.000641, "value_mse_loss_layer_001": 0.001892, "value_mse_loss_layer_002": 0.007599, "value_mse_loss_layer_003": 0.010803, "value_mse_loss_layer_004": 0.010193, "value_mse_loss_layer_005": 0.009766, "value_mse_loss_layer_006": 0.013062, "value_mse_loss_layer_007": 0.012939, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.020752, "value_mse_loss_layer_010": 0.017212, "value_mse_loss_layer_011": 0.018677, "value_mse_loss_layer_012": 0.018677, "value_mse_loss_layer_013": 0.020752, "value_mse_loss_layer_014": 0.021362, "value_mse_loss_layer_015": 0.024658, "value_mse_loss_layer_016": 0.02002, "value_mse_loss_layer_017": 0.024414, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.026367, "value_mse_loss_layer_020": 0.026001, "value_mse_loss_layer_021": 0.030884, "value_mse_loss_layer_022": 0.029785, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.050537, "value_mse_loss_layer_028": 0.062988, "value_mse_loss_layer_029": 0.068848, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.070312, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000183, "vq_loss_layer_007": 0.000196, "vq_loss_layer_008": 0.000206, "vq_loss_layer_009": 0.000269, "vq_loss_layer_010": 0.000212, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.000401, "vq_loss_layer_013": 0.000343, "vq_loss_layer_014": 0.000437, "vq_loss_layer_015": 0.000471, "vq_loss_layer_016": 0.000425, "vq_loss_layer_017": 0.000391, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000216, "vq_loss_layer_020": 0.000242, "vq_loss_layer_021": 0.000429, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000345, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.00034, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.001366, "vq_loss_layer_029": 0.000946, "vq_loss_layer_030": 0.003128, "vq_loss_layer_031": 0.004242 }, { "ce_loss": 2.3227, "epoch": 0.0067, "grad_norm": 0.002706312807276845, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.041992, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.07959, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.074219, "key_mse_loss_layer_030": 0.070801, "key_mse_loss_layer_031": 0.055664, "kv_mse_loss": 0.052985, "kv_vq_loss": 0.000502, "learning_rate": 0.0009565187006752064, "loss": 0.053485, "step": 6700, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.001915, "value_mse_loss_layer_002": 0.007233, "value_mse_loss_layer_003": 0.013, "value_mse_loss_layer_004": 0.011108, "value_mse_loss_layer_005": 0.010437, "value_mse_loss_layer_006": 0.012878, "value_mse_loss_layer_007": 0.013062, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.020996, "value_mse_loss_layer_010": 0.017578, "value_mse_loss_layer_011": 0.017944, "value_mse_loss_layer_012": 0.020264, "value_mse_loss_layer_013": 0.021362, "value_mse_loss_layer_014": 0.021606, "value_mse_loss_layer_015": 0.024536, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.025635, "value_mse_loss_layer_018": 0.022339, "value_mse_loss_layer_019": 0.025391, "value_mse_loss_layer_020": 0.026733, "value_mse_loss_layer_021": 0.03418, "value_mse_loss_layer_022": 0.028931, "value_mse_loss_layer_023": 0.032227, "value_mse_loss_layer_024": 0.036133, "value_mse_loss_layer_025": 0.044189, "value_mse_loss_layer_026": 0.043213, "value_mse_loss_layer_027": 0.050537, "value_mse_loss_layer_028": 0.052979, "value_mse_loss_layer_029": 0.068848, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.074707, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 5.3e-05, "vq_loss_layer_004": 7.7e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000179, "vq_loss_layer_007": 0.00018, "vq_loss_layer_008": 0.00022, "vq_loss_layer_009": 0.000273, "vq_loss_layer_010": 0.00025, "vq_loss_layer_011": 0.000259, "vq_loss_layer_012": 0.000507, "vq_loss_layer_013": 0.000391, "vq_loss_layer_014": 0.000463, "vq_loss_layer_015": 0.000496, "vq_loss_layer_016": 0.000458, "vq_loss_layer_017": 0.000456, "vq_loss_layer_018": 0.000284, "vq_loss_layer_019": 0.000232, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.000603, "vq_loss_layer_022": 0.000296, "vq_loss_layer_023": 0.000326, "vq_loss_layer_024": 0.000343, "vq_loss_layer_025": 0.000458, "vq_loss_layer_026": 0.000809, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.00132, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.005768 }, { "ce_loss": 2.320021, "epoch": 0.00671, "grad_norm": 0.0030828858725726604, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.053073, "kv_vq_loss": 0.000516, "learning_rate": 0.0009566806300422479, "loss": 0.05358, "step": 6710, "value_mse_loss_layer_000": 0.000626, "value_mse_loss_layer_001": 0.001869, "value_mse_loss_layer_002": 0.006805, "value_mse_loss_layer_003": 0.011353, "value_mse_loss_layer_004": 0.010071, "value_mse_loss_layer_005": 0.01001, "value_mse_loss_layer_006": 0.011902, "value_mse_loss_layer_007": 0.013184, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.020508, "value_mse_loss_layer_010": 0.01709, "value_mse_loss_layer_011": 0.017944, "value_mse_loss_layer_012": 0.019165, "value_mse_loss_layer_013": 0.020142, "value_mse_loss_layer_014": 0.021606, "value_mse_loss_layer_015": 0.02356, "value_mse_loss_layer_016": 0.019409, "value_mse_loss_layer_017": 0.023804, "value_mse_loss_layer_018": 0.020508, "value_mse_loss_layer_019": 0.024048, "value_mse_loss_layer_020": 0.037842, "value_mse_loss_layer_021": 0.03064, "value_mse_loss_layer_022": 0.031738, "value_mse_loss_layer_023": 0.035889, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.042969, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.047852, "value_mse_loss_layer_028": 0.05249, "value_mse_loss_layer_029": 0.069336, "value_mse_loss_layer_030": 0.072266, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.3e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000198, "vq_loss_layer_008": 0.000212, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000214, "vq_loss_layer_011": 0.000265, "vq_loss_layer_012": 0.000437, "vq_loss_layer_013": 0.000372, "vq_loss_layer_014": 0.000431, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.000389, "vq_loss_layer_017": 0.000397, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000188, "vq_loss_layer_020": 0.000313, "vq_loss_layer_021": 0.000425, "vq_loss_layer_022": 0.00033, "vq_loss_layer_023": 0.000364, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000338, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.272815, "epoch": 0.00672, "grad_norm": 0.0023379165213555098, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.100098, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.095215, "key_mse_loss_layer_031": 0.079102, "kv_mse_loss": 0.053644, "kv_vq_loss": 0.000513, "learning_rate": 0.0009568423182634561, "loss": 0.054156, "step": 6720, "value_mse_loss_layer_000": 0.000622, "value_mse_loss_layer_001": 0.001862, "value_mse_loss_layer_002": 0.006714, "value_mse_loss_layer_003": 0.011108, "value_mse_loss_layer_004": 0.010132, "value_mse_loss_layer_005": 0.010193, "value_mse_loss_layer_006": 0.011658, "value_mse_loss_layer_007": 0.013306, "value_mse_loss_layer_008": 0.015869, "value_mse_loss_layer_009": 0.02002, "value_mse_loss_layer_010": 0.016968, "value_mse_loss_layer_011": 0.018311, "value_mse_loss_layer_012": 0.018433, "value_mse_loss_layer_013": 0.02063, "value_mse_loss_layer_014": 0.02063, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.019653, "value_mse_loss_layer_017": 0.022705, "value_mse_loss_layer_018": 0.020386, "value_mse_loss_layer_019": 0.025635, "value_mse_loss_layer_020": 0.026611, "value_mse_loss_layer_021": 0.029907, "value_mse_loss_layer_022": 0.030029, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.043945, "value_mse_loss_layer_025": 0.041992, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.04834, "value_mse_loss_layer_028": 0.05127, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.067383, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 8.9e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000222, "vq_loss_layer_008": 0.000211, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000217, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.000416, "vq_loss_layer_013": 0.000383, "vq_loss_layer_014": 0.000429, "vq_loss_layer_015": 0.000475, "vq_loss_layer_016": 0.000454, "vq_loss_layer_017": 0.000347, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000278, "vq_loss_layer_021": 0.000418, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000408, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000534, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.307546, "epoch": 0.00673, "grad_norm": 0.00333505030721426, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.053165, "kv_vq_loss": 0.000504, "learning_rate": 0.0009570037660559941, "loss": 0.05368, "step": 6730, "value_mse_loss_layer_000": 0.00061, "value_mse_loss_layer_001": 0.001869, "value_mse_loss_layer_002": 0.007172, "value_mse_loss_layer_003": 0.010803, "value_mse_loss_layer_004": 0.009949, "value_mse_loss_layer_005": 0.009521, "value_mse_loss_layer_006": 0.011963, "value_mse_loss_layer_007": 0.012329, "value_mse_loss_layer_008": 0.014954, "value_mse_loss_layer_009": 0.020264, "value_mse_loss_layer_010": 0.01709, "value_mse_loss_layer_011": 0.017578, "value_mse_loss_layer_012": 0.018433, "value_mse_loss_layer_013": 0.019653, "value_mse_loss_layer_014": 0.023804, "value_mse_loss_layer_015": 0.023071, "value_mse_loss_layer_016": 0.018799, "value_mse_loss_layer_017": 0.023071, "value_mse_loss_layer_018": 0.020142, "value_mse_loss_layer_019": 0.023682, "value_mse_loss_layer_020": 0.025635, "value_mse_loss_layer_021": 0.029297, "value_mse_loss_layer_022": 0.028564, "value_mse_loss_layer_023": 0.031128, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.058105, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.067871, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000135, "vq_loss_layer_007": 0.000185, "vq_loss_layer_008": 0.000179, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000231, "vq_loss_layer_012": 0.000435, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000626, "vq_loss_layer_015": 0.000429, "vq_loss_layer_016": 0.000378, "vq_loss_layer_017": 0.000341, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000397, "vq_loss_layer_022": 0.000271, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000698, "vq_loss_layer_028": 0.001091, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.001686, "vq_loss_layer_031": 0.00383 }, { "ce_loss": 2.344043, "epoch": 0.00674, "grad_norm": 0.0028101264033466578, "key_mse_loss_layer_000": 0.00351, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.106445, "key_mse_loss_layer_011": 0.104492, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.133789, "key_mse_loss_layer_014": 0.130859, "key_mse_loss_layer_015": 0.116211, "key_mse_loss_layer_016": 0.111328, "key_mse_loss_layer_017": 0.109375, "key_mse_loss_layer_018": 0.118164, "key_mse_loss_layer_019": 0.096191, "key_mse_loss_layer_020": 0.108398, "key_mse_loss_layer_021": 0.101074, "key_mse_loss_layer_022": 0.106445, "key_mse_loss_layer_023": 0.10498, "key_mse_loss_layer_024": 0.085938, "key_mse_loss_layer_025": 0.078613, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.092285, "key_mse_loss_layer_028": 0.097656, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.097656, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.052939, "kv_vq_loss": 0.000514, "learning_rate": 0.0009571649741338297, "loss": 0.053455, "step": 6740, "value_mse_loss_layer_000": 0.000618, "value_mse_loss_layer_001": 0.001953, "value_mse_loss_layer_002": 0.007263, "value_mse_loss_layer_003": 0.012695, "value_mse_loss_layer_004": 0.010742, "value_mse_loss_layer_005": 0.010132, "value_mse_loss_layer_006": 0.011963, "value_mse_loss_layer_007": 0.012878, "value_mse_loss_layer_008": 0.015564, "value_mse_loss_layer_009": 0.019897, "value_mse_loss_layer_010": 0.01709, "value_mse_loss_layer_011": 0.018433, "value_mse_loss_layer_012": 0.019043, "value_mse_loss_layer_013": 0.019531, "value_mse_loss_layer_014": 0.02124, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.02356, "value_mse_loss_layer_019": 0.02356, "value_mse_loss_layer_020": 0.024658, "value_mse_loss_layer_021": 0.041016, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.031006, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.046631, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.071289, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2.1e-05, "vq_loss_layer_002": 2.2e-05, "vq_loss_layer_003": 4.4e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 8.8e-05, "vq_loss_layer_006": 0.000158, "vq_loss_layer_007": 0.000207, "vq_loss_layer_008": 0.000275, "vq_loss_layer_009": 0.00029, "vq_loss_layer_010": 0.000299, "vq_loss_layer_011": 0.000322, "vq_loss_layer_012": 0.000504, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.000557, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.000444, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000284, "vq_loss_layer_019": 0.000216, "vq_loss_layer_020": 0.000223, "vq_loss_layer_021": 0.000885, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000357, "vq_loss_layer_025": 0.000546, "vq_loss_layer_026": 0.000648, "vq_loss_layer_027": 0.000656, "vq_loss_layer_028": 0.001457, "vq_loss_layer_029": 0.001335, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.005554 }, { "ce_loss": 2.319903, "epoch": 0.00675, "grad_norm": 0.0026789456605911255, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.060059, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.043457, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.052844, "kv_vq_loss": 0.000496, "learning_rate": 0.000957325943207756, "loss": 0.053345, "step": 6750, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.00177, "value_mse_loss_layer_002": 0.007111, "value_mse_loss_layer_003": 0.011353, "value_mse_loss_layer_004": 0.010925, "value_mse_loss_layer_005": 0.010498, "value_mse_loss_layer_006": 0.011719, "value_mse_loss_layer_007": 0.012939, "value_mse_loss_layer_008": 0.015442, "value_mse_loss_layer_009": 0.019287, "value_mse_loss_layer_010": 0.016357, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.018433, "value_mse_loss_layer_013": 0.020142, "value_mse_loss_layer_014": 0.020508, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.021118, "value_mse_loss_layer_018": 0.020386, "value_mse_loss_layer_019": 0.02356, "value_mse_loss_layer_020": 0.025391, "value_mse_loss_layer_021": 0.03125, "value_mse_loss_layer_022": 0.027222, "value_mse_loss_layer_023": 0.031006, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.043457, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.049072, "value_mse_loss_layer_028": 0.050781, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.074707, "value_mse_loss_layer_031": 0.07666, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 9.2e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000129, "vq_loss_layer_007": 0.000221, "vq_loss_layer_008": 0.000269, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.000265, "vq_loss_layer_011": 0.000271, "vq_loss_layer_012": 0.000425, "vq_loss_layer_013": 0.000383, "vq_loss_layer_014": 0.000479, "vq_loss_layer_015": 0.000454, "vq_loss_layer_016": 0.000433, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000218, "vq_loss_layer_021": 0.000481, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.00033, "vq_loss_layer_025": 0.000587, "vq_loss_layer_026": 0.000599, "vq_loss_layer_027": 0.000721, "vq_loss_layer_028": 0.00106, "vq_loss_layer_029": 0.00119, "vq_loss_layer_030": 0.00354, "vq_loss_layer_031": 0.006805 }, { "ce_loss": 2.280897, "epoch": 0.00676, "grad_norm": 0.0032360428012907505, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.103516, "key_mse_loss_layer_023": 0.101074, "key_mse_loss_layer_024": 0.08252, "key_mse_loss_layer_025": 0.07959, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.091797, "key_mse_loss_layer_028": 0.098145, "key_mse_loss_layer_029": 0.091797, "key_mse_loss_layer_030": 0.09375, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.053741, "kv_vq_loss": 0.000522, "learning_rate": 0.000957486673985409, "loss": 0.054254, "step": 6760, "value_mse_loss_layer_000": 0.00061, "value_mse_loss_layer_001": 0.001884, "value_mse_loss_layer_002": 0.006989, "value_mse_loss_layer_003": 0.011353, "value_mse_loss_layer_004": 0.010559, "value_mse_loss_layer_005": 0.010437, "value_mse_loss_layer_006": 0.012573, "value_mse_loss_layer_007": 0.012695, "value_mse_loss_layer_008": 0.015259, "value_mse_loss_layer_009": 0.020264, "value_mse_loss_layer_010": 0.018677, "value_mse_loss_layer_011": 0.017578, "value_mse_loss_layer_012": 0.018555, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.022095, "value_mse_loss_layer_016": 0.018799, "value_mse_loss_layer_017": 0.022705, "value_mse_loss_layer_018": 0.020996, "value_mse_loss_layer_019": 0.024414, "value_mse_loss_layer_020": 0.025635, "value_mse_loss_layer_021": 0.030273, "value_mse_loss_layer_022": 0.032715, "value_mse_loss_layer_023": 0.0354, "value_mse_loss_layer_024": 0.044189, "value_mse_loss_layer_025": 0.046387, "value_mse_loss_layer_026": 0.041016, "value_mse_loss_layer_027": 0.05249, "value_mse_loss_layer_028": 0.055664, "value_mse_loss_layer_029": 0.07666, "value_mse_loss_layer_030": 0.07373, "value_mse_loss_layer_031": 0.077148, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000163, "vq_loss_layer_007": 0.000189, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000303, "vq_loss_layer_010": 0.000257, "vq_loss_layer_011": 0.000277, "vq_loss_layer_012": 0.00046, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.000418, "vq_loss_layer_017": 0.000422, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.000368, "vq_loss_layer_022": 0.000311, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000324, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.00087, "vq_loss_layer_029": 0.001114, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.0047 }, { "ce_loss": 2.352419, "epoch": 0.00677, "grad_norm": 0.002336920006200671, "key_mse_loss_layer_000": 0.003479, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.052774, "kv_vq_loss": 0.000494, "learning_rate": 0.000957647167171286, "loss": 0.053275, "step": 6770, "value_mse_loss_layer_000": 0.000614, "value_mse_loss_layer_001": 0.001862, "value_mse_loss_layer_002": 0.006683, "value_mse_loss_layer_003": 0.012512, "value_mse_loss_layer_004": 0.009644, "value_mse_loss_layer_005": 0.009521, "value_mse_loss_layer_006": 0.011597, "value_mse_loss_layer_007": 0.01239, "value_mse_loss_layer_008": 0.01532, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.016235, "value_mse_loss_layer_011": 0.017334, "value_mse_loss_layer_012": 0.017334, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.023193, "value_mse_loss_layer_016": 0.018921, "value_mse_loss_layer_017": 0.022095, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.02356, "value_mse_loss_layer_020": 0.025391, "value_mse_loss_layer_021": 0.030151, "value_mse_loss_layer_022": 0.030151, "value_mse_loss_layer_023": 0.032715, "value_mse_loss_layer_024": 0.042236, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.039551, "value_mse_loss_layer_027": 0.050293, "value_mse_loss_layer_028": 0.056152, "value_mse_loss_layer_029": 0.072266, "value_mse_loss_layer_030": 0.071777, "value_mse_loss_layer_031": 0.071777, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000197, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000194, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000376, "vq_loss_layer_013": 0.000343, "vq_loss_layer_014": 0.000477, "vq_loss_layer_015": 0.00045, "vq_loss_layer_016": 0.000399, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.003662 }, { "ce_loss": 2.286412, "epoch": 0.00678, "grad_norm": 0.004107177723199129, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.053409, "kv_vq_loss": 0.000514, "learning_rate": 0.0009578074234667659, "loss": 0.053918, "step": 6780, "value_mse_loss_layer_000": 0.000603, "value_mse_loss_layer_001": 0.001816, "value_mse_loss_layer_002": 0.007416, "value_mse_loss_layer_003": 0.010803, "value_mse_loss_layer_004": 0.010437, "value_mse_loss_layer_005": 0.009827, "value_mse_loss_layer_006": 0.011841, "value_mse_loss_layer_007": 0.012756, "value_mse_loss_layer_008": 0.015625, "value_mse_loss_layer_009": 0.020508, "value_mse_loss_layer_010": 0.016846, "value_mse_loss_layer_011": 0.017578, "value_mse_loss_layer_012": 0.019165, "value_mse_loss_layer_013": 0.020508, "value_mse_loss_layer_014": 0.020996, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.02002, "value_mse_loss_layer_017": 0.024048, "value_mse_loss_layer_018": 0.020752, "value_mse_loss_layer_019": 0.02417, "value_mse_loss_layer_020": 0.027344, "value_mse_loss_layer_021": 0.031128, "value_mse_loss_layer_022": 0.030396, "value_mse_loss_layer_023": 0.033936, "value_mse_loss_layer_024": 0.03833, "value_mse_loss_layer_025": 0.047852, "value_mse_loss_layer_026": 0.043213, "value_mse_loss_layer_027": 0.049316, "value_mse_loss_layer_028": 0.055176, "value_mse_loss_layer_029": 0.090332, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.070312, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 8.2e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000196, "vq_loss_layer_008": 0.000206, "vq_loss_layer_009": 0.000275, "vq_loss_layer_010": 0.000214, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.00045, "vq_loss_layer_013": 0.000383, "vq_loss_layer_014": 0.000406, "vq_loss_layer_015": 0.000444, "vq_loss_layer_016": 0.000422, "vq_loss_layer_017": 0.000412, "vq_loss_layer_018": 0.000229, "vq_loss_layer_019": 0.000181, "vq_loss_layer_020": 0.000229, "vq_loss_layer_021": 0.00041, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000338, "vq_loss_layer_026": 0.000618, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.0009, "vq_loss_layer_029": 0.001099, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.004089 }, { "ce_loss": 2.273754, "epoch": 0.00679, "grad_norm": 0.0032379399053752422, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.077148, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.091797, "key_mse_loss_layer_028": 0.094727, "key_mse_loss_layer_029": 0.089355, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.053265, "kv_vq_loss": 0.000523, "learning_rate": 0.0009579674435701252, "loss": 0.053784, "step": 6790, "value_mse_loss_layer_000": 0.00058, "value_mse_loss_layer_001": 0.001816, "value_mse_loss_layer_002": 0.007172, "value_mse_loss_layer_003": 0.010864, "value_mse_loss_layer_004": 0.01062, "value_mse_loss_layer_005": 0.010193, "value_mse_loss_layer_006": 0.011963, "value_mse_loss_layer_007": 0.012634, "value_mse_loss_layer_008": 0.01532, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.016724, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.018433, "value_mse_loss_layer_014": 0.019775, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.020874, "value_mse_loss_layer_017": 0.021851, "value_mse_loss_layer_018": 0.023926, "value_mse_loss_layer_019": 0.02417, "value_mse_loss_layer_020": 0.026123, "value_mse_loss_layer_021": 0.03125, "value_mse_loss_layer_022": 0.030762, "value_mse_loss_layer_023": 0.038818, "value_mse_loss_layer_024": 0.040283, "value_mse_loss_layer_025": 0.047363, "value_mse_loss_layer_026": 0.042969, "value_mse_loss_layer_027": 0.057617, "value_mse_loss_layer_028": 0.062012, "value_mse_loss_layer_029": 0.077637, "value_mse_loss_layer_030": 0.078125, "value_mse_loss_layer_031": 0.084473, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000214, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000239, "vq_loss_layer_011": 0.000231, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000462, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000368, "vq_loss_layer_019": 0.000198, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000431, "vq_loss_layer_026": 0.000553, "vq_loss_layer_027": 0.000706, "vq_loss_layer_028": 0.00116, "vq_loss_layer_029": 0.001587, "vq_loss_layer_030": 0.003479, "vq_loss_layer_031": 0.006134 }, { "ce_loss": 2.305691, "epoch": 0.0068, "grad_norm": 0.0023870256263762712, "key_mse_loss_layer_000": 0.002808, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.053, "kv_vq_loss": 0.000501, "learning_rate": 0.0009581272281765589, "loss": 0.053494, "step": 6800, "value_mse_loss_layer_000": 0.000618, "value_mse_loss_layer_001": 0.001831, "value_mse_loss_layer_002": 0.006653, "value_mse_loss_layer_003": 0.011597, "value_mse_loss_layer_004": 0.009888, "value_mse_loss_layer_005": 0.009766, "value_mse_loss_layer_006": 0.01178, "value_mse_loss_layer_007": 0.012695, "value_mse_loss_layer_008": 0.015869, "value_mse_loss_layer_009": 0.02002, "value_mse_loss_layer_010": 0.017212, "value_mse_loss_layer_011": 0.018066, "value_mse_loss_layer_012": 0.017944, "value_mse_loss_layer_013": 0.019897, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.022827, "value_mse_loss_layer_016": 0.019531, "value_mse_loss_layer_017": 0.02356, "value_mse_loss_layer_018": 0.020752, "value_mse_loss_layer_019": 0.030518, "value_mse_loss_layer_020": 0.025391, "value_mse_loss_layer_021": 0.029907, "value_mse_loss_layer_022": 0.029663, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.044189, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.047607, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.068848, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000184, "vq_loss_layer_008": 0.000205, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000256, "vq_loss_layer_012": 0.000383, "vq_loss_layer_013": 0.000351, "vq_loss_layer_014": 0.000427, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.000357, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000303, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000393, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.00033, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.000477, "vq_loss_layer_027": 0.000475, "vq_loss_layer_028": 0.000725, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.343682, "epoch": 0.00681, "grad_norm": 0.0032509281300008297, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.121582, "key_mse_loss_layer_015": 0.11084, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.053494, "kv_vq_loss": 0.000522, "learning_rate": 0.0009582867779781962, "loss": 0.054013, "step": 6810, "value_mse_loss_layer_000": 0.000607, "value_mse_loss_layer_001": 0.001862, "value_mse_loss_layer_002": 0.006989, "value_mse_loss_layer_003": 0.011658, "value_mse_loss_layer_004": 0.011047, "value_mse_loss_layer_005": 0.010681, "value_mse_loss_layer_006": 0.012939, "value_mse_loss_layer_007": 0.013245, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.017334, "value_mse_loss_layer_011": 0.018311, "value_mse_loss_layer_012": 0.020386, "value_mse_loss_layer_013": 0.021362, "value_mse_loss_layer_014": 0.021484, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.019775, "value_mse_loss_layer_017": 0.023315, "value_mse_loss_layer_018": 0.021118, "value_mse_loss_layer_019": 0.024902, "value_mse_loss_layer_020": 0.026245, "value_mse_loss_layer_021": 0.03125, "value_mse_loss_layer_022": 0.029663, "value_mse_loss_layer_023": 0.032715, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.047363, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.053223, "value_mse_loss_layer_028": 0.054688, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.071289, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 8.4e-05, "vq_loss_layer_005": 9.4e-05, "vq_loss_layer_006": 0.000179, "vq_loss_layer_007": 0.000197, "vq_loss_layer_008": 0.000233, "vq_loss_layer_009": 0.000261, "vq_loss_layer_010": 0.000273, "vq_loss_layer_011": 0.00029, "vq_loss_layer_012": 0.000492, "vq_loss_layer_013": 0.000393, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000511, "vq_loss_layer_016": 0.000469, "vq_loss_layer_017": 0.000406, "vq_loss_layer_018": 0.000263, "vq_loss_layer_019": 0.000229, "vq_loss_layer_020": 0.000296, "vq_loss_layer_021": 0.00053, "vq_loss_layer_022": 0.00036, "vq_loss_layer_023": 0.000372, "vq_loss_layer_024": 0.000479, "vq_loss_layer_025": 0.000599, "vq_loss_layer_026": 0.00069, "vq_loss_layer_027": 0.000813, "vq_loss_layer_028": 0.001465, "vq_loss_layer_029": 0.001411, "vq_loss_layer_030": 0.003006, "vq_loss_layer_031": 0.005524 }, { "ce_loss": 2.312248, "epoch": 0.00682, "grad_norm": 0.00344414496794343, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.059326, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.060303, "kv_mse_loss": 0.053754, "kv_vq_loss": 0.000547, "learning_rate": 0.0009584460936641196, "loss": 0.054294, "step": 6820, "value_mse_loss_layer_000": 0.000584, "value_mse_loss_layer_001": 0.001808, "value_mse_loss_layer_002": 0.006805, "value_mse_loss_layer_003": 0.010864, "value_mse_loss_layer_004": 0.010498, "value_mse_loss_layer_005": 0.009583, "value_mse_loss_layer_006": 0.011536, "value_mse_loss_layer_007": 0.012634, "value_mse_loss_layer_008": 0.015015, "value_mse_loss_layer_009": 0.019897, "value_mse_loss_layer_010": 0.017456, "value_mse_loss_layer_011": 0.0177, "value_mse_loss_layer_012": 0.018188, "value_mse_loss_layer_013": 0.019531, "value_mse_loss_layer_014": 0.020752, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.022095, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.022705, "value_mse_loss_layer_020": 0.023193, "value_mse_loss_layer_021": 0.03125, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.034912, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.047607, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.071777, "value_mse_loss_layer_031": 0.072754, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 2.6e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000196, "vq_loss_layer_008": 0.000238, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000263, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000423, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000523, "vq_loss_layer_015": 0.00046, "vq_loss_layer_016": 0.000488, "vq_loss_layer_017": 0.000454, "vq_loss_layer_018": 0.000229, "vq_loss_layer_019": 0.000238, "vq_loss_layer_020": 0.000232, "vq_loss_layer_021": 0.000622, "vq_loss_layer_022": 0.000351, "vq_loss_layer_023": 0.000511, "vq_loss_layer_024": 0.000389, "vq_loss_layer_025": 0.000542, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.000866, "vq_loss_layer_028": 0.001152, "vq_loss_layer_029": 0.001411, "vq_loss_layer_030": 0.005096, "vq_loss_layer_031": 0.006378 }, { "ce_loss": 2.268702, "epoch": 0.00683, "grad_norm": 0.0030334386974573135, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.009705, "key_mse_loss_layer_002": 0.05127, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.053159, "kv_vq_loss": 0.000525, "learning_rate": 0.0009586051759203831, "loss": 0.053674, "step": 6830, "value_mse_loss_layer_000": 0.000587, "value_mse_loss_layer_001": 0.001816, "value_mse_loss_layer_002": 0.006714, "value_mse_loss_layer_003": 0.01062, "value_mse_loss_layer_004": 0.009766, "value_mse_loss_layer_005": 0.009827, "value_mse_loss_layer_006": 0.011658, "value_mse_loss_layer_007": 0.012512, "value_mse_loss_layer_008": 0.015991, "value_mse_loss_layer_009": 0.02063, "value_mse_loss_layer_010": 0.016968, "value_mse_loss_layer_011": 0.018433, "value_mse_loss_layer_012": 0.018799, "value_mse_loss_layer_013": 0.020264, "value_mse_loss_layer_014": 0.021484, "value_mse_loss_layer_015": 0.022949, "value_mse_loss_layer_016": 0.019287, "value_mse_loss_layer_017": 0.022949, "value_mse_loss_layer_018": 0.020264, "value_mse_loss_layer_019": 0.022949, "value_mse_loss_layer_020": 0.024048, "value_mse_loss_layer_021": 0.029541, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.071777, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.000241, "vq_loss_layer_009": 0.000273, "vq_loss_layer_010": 0.000224, "vq_loss_layer_011": 0.000278, "vq_loss_layer_012": 0.000397, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000469, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.000463, "vq_loss_layer_017": 0.00041, "vq_loss_layer_018": 0.000213, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000218, "vq_loss_layer_021": 0.000473, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000368, "vq_loss_layer_024": 0.000319, "vq_loss_layer_025": 0.000402, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000957, "vq_loss_layer_029": 0.001183, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.005066 }, { "ce_loss": 2.326499, "epoch": 0.00684, "grad_norm": 0.003094663843512535, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.066406, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.075684, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.053519, "kv_vq_loss": 0.000513, "learning_rate": 0.0009587640254300289, "loss": 0.054031, "step": 6840, "value_mse_loss_layer_000": 0.000614, "value_mse_loss_layer_001": 0.001831, "value_mse_loss_layer_002": 0.006989, "value_mse_loss_layer_003": 0.011169, "value_mse_loss_layer_004": 0.011597, "value_mse_loss_layer_005": 0.010132, "value_mse_loss_layer_006": 0.012451, "value_mse_loss_layer_007": 0.013245, "value_mse_loss_layer_008": 0.015503, "value_mse_loss_layer_009": 0.020996, "value_mse_loss_layer_010": 0.017334, "value_mse_loss_layer_011": 0.017578, "value_mse_loss_layer_012": 0.018677, "value_mse_loss_layer_013": 0.02063, "value_mse_loss_layer_014": 0.02124, "value_mse_loss_layer_015": 0.025269, "value_mse_loss_layer_016": 0.019531, "value_mse_loss_layer_017": 0.024292, "value_mse_loss_layer_018": 0.020996, "value_mse_loss_layer_019": 0.026978, "value_mse_loss_layer_020": 0.027588, "value_mse_loss_layer_021": 0.029541, "value_mse_loss_layer_022": 0.029419, "value_mse_loss_layer_023": 0.031738, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.055908, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.064941, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.067383, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 0.000113, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.0002, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000275, "vq_loss_layer_010": 0.000221, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.000404, "vq_loss_layer_013": 0.000345, "vq_loss_layer_014": 0.000425, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.000399, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000215, "vq_loss_layer_019": 0.000192, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000444, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000309, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.00046, "vq_loss_layer_026": 0.000553, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000843, "vq_loss_layer_029": 0.000954, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.289806, "epoch": 0.00685, "grad_norm": 0.00386934750713408, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.080078, "key_mse_loss_layer_025": 0.07666, "key_mse_loss_layer_026": 0.087891, "key_mse_loss_layer_027": 0.089844, "key_mse_loss_layer_028": 0.094727, "key_mse_loss_layer_029": 0.090332, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.053378, "kv_vq_loss": 0.000523, "learning_rate": 0.0009589226428731063, "loss": 0.053891, "step": 6850, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001839, "value_mse_loss_layer_002": 0.006714, "value_mse_loss_layer_003": 0.011169, "value_mse_loss_layer_004": 0.009949, "value_mse_loss_layer_005": 0.009827, "value_mse_loss_layer_006": 0.011658, "value_mse_loss_layer_007": 0.012268, "value_mse_loss_layer_008": 0.015442, "value_mse_loss_layer_009": 0.019897, "value_mse_loss_layer_010": 0.016235, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.017456, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.019287, "value_mse_loss_layer_017": 0.022095, "value_mse_loss_layer_018": 0.02063, "value_mse_loss_layer_019": 0.024902, "value_mse_loss_layer_020": 0.026489, "value_mse_loss_layer_021": 0.028687, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.033203, "value_mse_loss_layer_024": 0.038574, "value_mse_loss_layer_025": 0.046387, "value_mse_loss_layer_026": 0.04126, "value_mse_loss_layer_027": 0.057861, "value_mse_loss_layer_028": 0.055664, "value_mse_loss_layer_029": 0.09082, "value_mse_loss_layer_030": 0.074219, "value_mse_loss_layer_031": 0.072754, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000182, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000202, "vq_loss_layer_011": 0.000241, "vq_loss_layer_012": 0.000381, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000404, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000694, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.001495, "vq_loss_layer_030": 0.002686, "vq_loss_layer_031": 0.004272 }, { "ce_loss": 2.350279, "epoch": 0.00686, "grad_norm": 0.0025320183485746384, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.053113, "kv_vq_loss": 0.000522, "learning_rate": 0.0009590810289266878, "loss": 0.053629, "step": 6860, "value_mse_loss_layer_000": 0.000607, "value_mse_loss_layer_001": 0.001846, "value_mse_loss_layer_002": 0.006714, "value_mse_loss_layer_003": 0.011108, "value_mse_loss_layer_004": 0.01001, "value_mse_loss_layer_005": 0.01001, "value_mse_loss_layer_006": 0.011841, "value_mse_loss_layer_007": 0.01355, "value_mse_loss_layer_008": 0.016113, "value_mse_loss_layer_009": 0.020142, "value_mse_loss_layer_010": 0.017456, "value_mse_loss_layer_011": 0.018433, "value_mse_loss_layer_012": 0.019165, "value_mse_loss_layer_013": 0.020996, "value_mse_loss_layer_014": 0.021851, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.019775, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.020142, "value_mse_loss_layer_019": 0.024048, "value_mse_loss_layer_020": 0.025391, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.029297, "value_mse_loss_layer_023": 0.032715, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.042969, "value_mse_loss_layer_026": 0.038574, "value_mse_loss_layer_027": 0.050781, "value_mse_loss_layer_028": 0.053711, "value_mse_loss_layer_029": 0.072754, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.071289, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000222, "vq_loss_layer_008": 0.000232, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000246, "vq_loss_layer_011": 0.000278, "vq_loss_layer_012": 0.000423, "vq_loss_layer_013": 0.00038, "vq_loss_layer_014": 0.000471, "vq_loss_layer_015": 0.000467, "vq_loss_layer_016": 0.000444, "vq_loss_layer_017": 0.000376, "vq_loss_layer_018": 0.000221, "vq_loss_layer_019": 0.000207, "vq_loss_layer_020": 0.000252, "vq_loss_layer_021": 0.000429, "vq_loss_layer_022": 0.000349, "vq_loss_layer_023": 0.000366, "vq_loss_layer_024": 0.000353, "vq_loss_layer_025": 0.000414, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000687, "vq_loss_layer_028": 0.00095, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.004395 }, { "ce_loss": 2.287244, "epoch": 0.00687, "grad_norm": 0.003057490335777402, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.060059, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.091797, "key_mse_loss_layer_009": 0.098633, "key_mse_loss_layer_010": 0.111328, "key_mse_loss_layer_011": 0.106445, "key_mse_loss_layer_012": 0.080078, "key_mse_loss_layer_013": 0.137695, "key_mse_loss_layer_014": 0.134766, "key_mse_loss_layer_015": 0.123047, "key_mse_loss_layer_016": 0.117188, "key_mse_loss_layer_017": 0.115723, "key_mse_loss_layer_018": 0.123047, "key_mse_loss_layer_019": 0.098145, "key_mse_loss_layer_020": 0.113281, "key_mse_loss_layer_021": 0.107422, "key_mse_loss_layer_022": 0.114746, "key_mse_loss_layer_023": 0.109375, "key_mse_loss_layer_024": 0.087891, "key_mse_loss_layer_025": 0.083496, "key_mse_loss_layer_026": 0.098633, "key_mse_loss_layer_027": 0.09668, "key_mse_loss_layer_028": 0.104492, "key_mse_loss_layer_029": 0.091309, "key_mse_loss_layer_030": 0.102051, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.053525, "kv_vq_loss": 0.000531, "learning_rate": 0.0009592391842648874, "loss": 0.054053, "step": 6870, "value_mse_loss_layer_000": 0.000633, "value_mse_loss_layer_001": 0.0019, "value_mse_loss_layer_002": 0.008057, "value_mse_loss_layer_003": 0.011719, "value_mse_loss_layer_004": 0.011719, "value_mse_loss_layer_005": 0.010071, "value_mse_loss_layer_006": 0.01239, "value_mse_loss_layer_007": 0.012817, "value_mse_loss_layer_008": 0.015076, "value_mse_loss_layer_009": 0.020508, "value_mse_loss_layer_010": 0.016846, "value_mse_loss_layer_011": 0.0177, "value_mse_loss_layer_012": 0.019287, "value_mse_loss_layer_013": 0.019531, "value_mse_loss_layer_014": 0.02124, "value_mse_loss_layer_015": 0.021606, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.021851, "value_mse_loss_layer_018": 0.019409, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.028198, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.030762, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.041992, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.073242, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 2.7e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 0.000122, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000185, "vq_loss_layer_007": 0.000188, "vq_loss_layer_008": 0.000237, "vq_loss_layer_009": 0.000305, "vq_loss_layer_010": 0.00028, "vq_loss_layer_011": 0.000288, "vq_loss_layer_012": 0.0005, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000546, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000479, "vq_loss_layer_017": 0.000362, "vq_loss_layer_018": 0.000227, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000256, "vq_loss_layer_021": 0.000538, "vq_loss_layer_022": 0.000376, "vq_loss_layer_023": 0.00046, "vq_loss_layer_024": 0.000671, "vq_loss_layer_025": 0.00071, "vq_loss_layer_026": 0.000729, "vq_loss_layer_027": 0.000835, "vq_loss_layer_028": 0.001343, "vq_loss_layer_029": 0.001343, "vq_loss_layer_030": 0.003067, "vq_loss_layer_031": 0.006073 }, { "ce_loss": 2.310751, "epoch": 0.00688, "grad_norm": 0.0034413582179695368, "key_mse_loss_layer_000": 0.003479, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.095215, "key_mse_loss_layer_031": 0.08252, "kv_mse_loss": 0.053586, "kv_vq_loss": 0.00052, "learning_rate": 0.0009593971095588776, "loss": 0.054099, "step": 6880, "value_mse_loss_layer_000": 0.000626, "value_mse_loss_layer_001": 0.001884, "value_mse_loss_layer_002": 0.00705, "value_mse_loss_layer_003": 0.011414, "value_mse_loss_layer_004": 0.010071, "value_mse_loss_layer_005": 0.01062, "value_mse_loss_layer_006": 0.011719, "value_mse_loss_layer_007": 0.012939, "value_mse_loss_layer_008": 0.015503, "value_mse_loss_layer_009": 0.019897, "value_mse_loss_layer_010": 0.016602, "value_mse_loss_layer_011": 0.017456, "value_mse_loss_layer_012": 0.019043, "value_mse_loss_layer_013": 0.019897, "value_mse_loss_layer_014": 0.020386, "value_mse_loss_layer_015": 0.022949, "value_mse_loss_layer_016": 0.018799, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.022339, "value_mse_loss_layer_019": 0.025269, "value_mse_loss_layer_020": 0.026611, "value_mse_loss_layer_021": 0.029419, "value_mse_loss_layer_022": 0.030029, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.043945, "value_mse_loss_layer_026": 0.038818, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.050781, "value_mse_loss_layer_029": 0.07959, "value_mse_loss_layer_030": 0.072754, "value_mse_loss_layer_031": 0.070801, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 0.000107, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.00021, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000206, "vq_loss_layer_011": 0.000232, "vq_loss_layer_012": 0.000439, "vq_loss_layer_013": 0.000381, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.000425, "vq_loss_layer_016": 0.000374, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.001259, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.004486 }, { "ce_loss": 2.270655, "epoch": 0.00689, "grad_norm": 0.0024891220964491367, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.053137, "kv_vq_loss": 0.000507, "learning_rate": 0.0009595548054769064, "loss": 0.05365, "step": 6890, "value_mse_loss_layer_000": 0.000599, "value_mse_loss_layer_001": 0.001823, "value_mse_loss_layer_002": 0.006653, "value_mse_loss_layer_003": 0.010803, "value_mse_loss_layer_004": 0.009583, "value_mse_loss_layer_005": 0.009644, "value_mse_loss_layer_006": 0.011475, "value_mse_loss_layer_007": 0.012268, "value_mse_loss_layer_008": 0.015198, "value_mse_loss_layer_009": 0.020264, "value_mse_loss_layer_010": 0.018677, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.018066, "value_mse_loss_layer_013": 0.019409, "value_mse_loss_layer_014": 0.020264, "value_mse_loss_layer_015": 0.023193, "value_mse_loss_layer_016": 0.019531, "value_mse_loss_layer_017": 0.023438, "value_mse_loss_layer_018": 0.020996, "value_mse_loss_layer_019": 0.02417, "value_mse_loss_layer_020": 0.025879, "value_mse_loss_layer_021": 0.033691, "value_mse_loss_layer_022": 0.030151, "value_mse_loss_layer_023": 0.032227, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.043457, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.049316, "value_mse_loss_layer_028": 0.052734, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.000224, "vq_loss_layer_011": 0.000286, "vq_loss_layer_012": 0.000376, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000393, "vq_loss_layer_015": 0.000448, "vq_loss_layer_016": 0.000408, "vq_loss_layer_017": 0.000383, "vq_loss_layer_018": 0.000222, "vq_loss_layer_019": 0.000198, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000431, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000238, "vq_loss_layer_025": 0.000351, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000713, "vq_loss_layer_029": 0.000973, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.00386 }, { "ce_loss": 2.356356, "epoch": 0.0069, "grad_norm": 0.004145320970565081, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.053171, "kv_vq_loss": 0.000517, "learning_rate": 0.0009597122726843138, "loss": 0.053687, "step": 6900, "value_mse_loss_layer_000": 0.000641, "value_mse_loss_layer_001": 0.001877, "value_mse_loss_layer_002": 0.006592, "value_mse_loss_layer_003": 0.011475, "value_mse_loss_layer_004": 0.010193, "value_mse_loss_layer_005": 0.009766, "value_mse_loss_layer_006": 0.012512, "value_mse_loss_layer_007": 0.012878, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.020508, "value_mse_loss_layer_010": 0.017212, "value_mse_loss_layer_011": 0.018066, "value_mse_loss_layer_012": 0.018555, "value_mse_loss_layer_013": 0.020874, "value_mse_loss_layer_014": 0.022095, "value_mse_loss_layer_015": 0.02417, "value_mse_loss_layer_016": 0.019287, "value_mse_loss_layer_017": 0.026123, "value_mse_loss_layer_018": 0.020264, "value_mse_loss_layer_019": 0.023682, "value_mse_loss_layer_020": 0.026001, "value_mse_loss_layer_021": 0.030151, "value_mse_loss_layer_022": 0.029297, "value_mse_loss_layer_023": 0.041748, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.088867, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.067871, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000165, "vq_loss_layer_007": 0.000201, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000254, "vq_loss_layer_012": 0.000381, "vq_loss_layer_013": 0.000362, "vq_loss_layer_014": 0.000441, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000368, "vq_loss_layer_017": 0.000479, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000395, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000422, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.001297, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.325251, "epoch": 0.00691, "grad_norm": 0.0025111911818385124, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.053156, "kv_vq_loss": 0.000508, "learning_rate": 0.0009598695118435494, "loss": 0.053662, "step": 6910, "value_mse_loss_layer_000": 0.000607, "value_mse_loss_layer_001": 0.001839, "value_mse_loss_layer_002": 0.006744, "value_mse_loss_layer_003": 0.010803, "value_mse_loss_layer_004": 0.010742, "value_mse_loss_layer_005": 0.01001, "value_mse_loss_layer_006": 0.012024, "value_mse_loss_layer_007": 0.012817, "value_mse_loss_layer_008": 0.01532, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.016602, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.025879, "value_mse_loss_layer_013": 0.019653, "value_mse_loss_layer_014": 0.020752, "value_mse_loss_layer_015": 0.022827, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.023315, "value_mse_loss_layer_018": 0.020874, "value_mse_loss_layer_019": 0.02478, "value_mse_loss_layer_020": 0.02478, "value_mse_loss_layer_021": 0.029663, "value_mse_loss_layer_022": 0.029297, "value_mse_loss_layer_023": 0.0354, "value_mse_loss_layer_024": 0.036133, "value_mse_loss_layer_025": 0.044922, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.049316, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.069824, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.070801, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000141, "vq_loss_layer_007": 0.00019, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000267, "vq_loss_layer_010": 0.000212, "vq_loss_layer_011": 0.000232, "vq_loss_layer_012": 0.000889, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000406, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000412, "vq_loss_layer_017": 0.000376, "vq_loss_layer_018": 0.000238, "vq_loss_layer_019": 0.000211, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000395, "vq_loss_layer_022": 0.000292, "vq_loss_layer_023": 0.00034, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.000355, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.000881, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.278603, "epoch": 0.00692, "grad_norm": 0.0019623192492872477, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.052905, "kv_vq_loss": 0.0005, "learning_rate": 0.0009600265236141894, "loss": 0.053406, "step": 6920, "value_mse_loss_layer_000": 0.00061, "value_mse_loss_layer_001": 0.001862, "value_mse_loss_layer_002": 0.007019, "value_mse_loss_layer_003": 0.011536, "value_mse_loss_layer_004": 0.010193, "value_mse_loss_layer_005": 0.010376, "value_mse_loss_layer_006": 0.011841, "value_mse_loss_layer_007": 0.012817, "value_mse_loss_layer_008": 0.015503, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.016968, "value_mse_loss_layer_011": 0.018677, "value_mse_loss_layer_012": 0.019653, "value_mse_loss_layer_013": 0.020508, "value_mse_loss_layer_014": 0.020996, "value_mse_loss_layer_015": 0.024658, "value_mse_loss_layer_016": 0.019531, "value_mse_loss_layer_017": 0.021851, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.023071, "value_mse_loss_layer_020": 0.024414, "value_mse_loss_layer_021": 0.029907, "value_mse_loss_layer_022": 0.029419, "value_mse_loss_layer_023": 0.031738, "value_mse_loss_layer_024": 0.043457, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.049316, "value_mse_loss_layer_028": 0.05127, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.068848, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.000226, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000265, "vq_loss_layer_011": 0.000288, "vq_loss_layer_012": 0.000441, "vq_loss_layer_013": 0.000383, "vq_loss_layer_014": 0.000477, "vq_loss_layer_015": 0.000568, "vq_loss_layer_016": 0.000496, "vq_loss_layer_017": 0.000341, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000212, "vq_loss_layer_020": 0.000278, "vq_loss_layer_021": 0.000511, "vq_loss_layer_022": 0.000338, "vq_loss_layer_023": 0.00032, "vq_loss_layer_024": 0.000553, "vq_loss_layer_025": 0.000443, "vq_loss_layer_026": 0.000652, "vq_loss_layer_027": 0.000778, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001282, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.005096 }, { "ce_loss": 2.288533, "epoch": 0.00693, "grad_norm": 0.0023864987306296825, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.053403, "kv_vq_loss": 0.000509, "learning_rate": 0.0009601833086529516, "loss": 0.053915, "step": 6930, "value_mse_loss_layer_000": 0.000614, "value_mse_loss_layer_001": 0.001854, "value_mse_loss_layer_002": 0.006653, "value_mse_loss_layer_003": 0.011841, "value_mse_loss_layer_004": 0.010559, "value_mse_loss_layer_005": 0.009827, "value_mse_loss_layer_006": 0.011719, "value_mse_loss_layer_007": 0.012695, "value_mse_loss_layer_008": 0.015259, "value_mse_loss_layer_009": 0.020996, "value_mse_loss_layer_010": 0.018433, "value_mse_loss_layer_011": 0.017334, "value_mse_loss_layer_012": 0.019043, "value_mse_loss_layer_013": 0.02002, "value_mse_loss_layer_014": 0.022461, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.024536, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.025024, "value_mse_loss_layer_020": 0.025635, "value_mse_loss_layer_021": 0.03125, "value_mse_loss_layer_022": 0.028931, "value_mse_loss_layer_023": 0.032715, "value_mse_loss_layer_024": 0.036377, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.051514, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.068359, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000195, "vq_loss_layer_008": 0.000195, "vq_loss_layer_009": 0.000286, "vq_loss_layer_010": 0.000233, "vq_loss_layer_011": 0.000226, "vq_loss_layer_012": 0.000412, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.000469, "vq_loss_layer_015": 0.000425, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000454, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000182, "vq_loss_layer_020": 0.000221, "vq_loss_layer_021": 0.000443, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.004425 }, { "ce_loss": 2.294516, "epoch": 0.00694, "grad_norm": 0.0028755974490195513, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.059814, "key_mse_loss_layer_003": 0.055176, "key_mse_loss_layer_004": 0.064941, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.071777, "key_mse_loss_layer_007": 0.080078, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.053214, "kv_vq_loss": 0.000517, "learning_rate": 0.0009603398676137136, "loss": 0.05372, "step": 6940, "value_mse_loss_layer_000": 0.000629, "value_mse_loss_layer_001": 0.001869, "value_mse_loss_layer_002": 0.006775, "value_mse_loss_layer_003": 0.01123, "value_mse_loss_layer_004": 0.009583, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.01239, "value_mse_loss_layer_007": 0.01239, "value_mse_loss_layer_008": 0.015869, "value_mse_loss_layer_009": 0.020142, "value_mse_loss_layer_010": 0.017334, "value_mse_loss_layer_011": 0.018555, "value_mse_loss_layer_012": 0.018921, "value_mse_loss_layer_013": 0.020264, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.024292, "value_mse_loss_layer_016": 0.022217, "value_mse_loss_layer_017": 0.02478, "value_mse_loss_layer_018": 0.021484, "value_mse_loss_layer_019": 0.025024, "value_mse_loss_layer_020": 0.026611, "value_mse_loss_layer_021": 0.030762, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.036377, "value_mse_loss_layer_025": 0.049805, "value_mse_loss_layer_026": 0.037598, "value_mse_loss_layer_027": 0.047607, "value_mse_loss_layer_028": 0.053223, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.072266, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000158, "vq_loss_layer_007": 0.000202, "vq_loss_layer_008": 0.000191, "vq_loss_layer_009": 0.000241, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000256, "vq_loss_layer_012": 0.000397, "vq_loss_layer_013": 0.000345, "vq_loss_layer_014": 0.000425, "vq_loss_layer_015": 0.000429, "vq_loss_layer_016": 0.000431, "vq_loss_layer_017": 0.000402, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000364, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.00021, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000433, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.003815 }, { "ce_loss": 2.260016, "epoch": 0.00695, "grad_norm": 0.0026214460376650095, "key_mse_loss_layer_000": 0.003662, "key_mse_loss_layer_001": 0.010925, "key_mse_loss_layer_002": 0.060059, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.089844, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.052972, "kv_vq_loss": 0.000509, "learning_rate": 0.0009604962011475283, "loss": 0.053488, "step": 6950, "value_mse_loss_layer_000": 0.000599, "value_mse_loss_layer_001": 0.001854, "value_mse_loss_layer_002": 0.006989, "value_mse_loss_layer_003": 0.011353, "value_mse_loss_layer_004": 0.010437, "value_mse_loss_layer_005": 0.010925, "value_mse_loss_layer_006": 0.012268, "value_mse_loss_layer_007": 0.013123, "value_mse_loss_layer_008": 0.015625, "value_mse_loss_layer_009": 0.019531, "value_mse_loss_layer_010": 0.016479, "value_mse_loss_layer_011": 0.0177, "value_mse_loss_layer_012": 0.018555, "value_mse_loss_layer_013": 0.019775, "value_mse_loss_layer_014": 0.020996, "value_mse_loss_layer_015": 0.022705, "value_mse_loss_layer_016": 0.019775, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.022095, "value_mse_loss_layer_019": 0.024658, "value_mse_loss_layer_020": 0.025391, "value_mse_loss_layer_021": 0.029663, "value_mse_loss_layer_022": 0.029907, "value_mse_loss_layer_023": 0.042969, "value_mse_loss_layer_024": 0.038574, "value_mse_loss_layer_025": 0.047607, "value_mse_loss_layer_026": 0.041016, "value_mse_loss_layer_027": 0.05249, "value_mse_loss_layer_028": 0.056885, "value_mse_loss_layer_029": 0.075195, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.078613, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 0.000105, "vq_loss_layer_006": 0.000163, "vq_loss_layer_007": 0.000194, "vq_loss_layer_008": 0.000224, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000238, "vq_loss_layer_011": 0.000248, "vq_loss_layer_012": 0.000383, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000435, "vq_loss_layer_015": 0.00046, "vq_loss_layer_016": 0.000441, "vq_loss_layer_017": 0.00037, "vq_loss_layer_018": 0.000305, "vq_loss_layer_019": 0.000227, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000364, "vq_loss_layer_022": 0.000292, "vq_loss_layer_023": 0.000423, "vq_loss_layer_024": 0.000336, "vq_loss_layer_025": 0.000504, "vq_loss_layer_026": 0.000568, "vq_loss_layer_027": 0.000679, "vq_loss_layer_028": 0.001045, "vq_loss_layer_029": 0.001587, "vq_loss_layer_030": 0.002731, "vq_loss_layer_031": 0.006256 }, { "ce_loss": 2.327921, "epoch": 0.00696, "grad_norm": 0.002397931879386306, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.053146, "kv_vq_loss": 0.000501, "learning_rate": 0.0009606523099026404, "loss": 0.053653, "step": 6960, "value_mse_loss_layer_000": 0.000603, "value_mse_loss_layer_001": 0.001869, "value_mse_loss_layer_002": 0.006683, "value_mse_loss_layer_003": 0.01123, "value_mse_loss_layer_004": 0.010071, "value_mse_loss_layer_005": 0.010254, "value_mse_loss_layer_006": 0.01178, "value_mse_loss_layer_007": 0.012634, "value_mse_loss_layer_008": 0.015259, "value_mse_loss_layer_009": 0.020508, "value_mse_loss_layer_010": 0.016968, "value_mse_loss_layer_011": 0.018188, "value_mse_loss_layer_012": 0.019287, "value_mse_loss_layer_013": 0.020386, "value_mse_loss_layer_014": 0.022949, "value_mse_loss_layer_015": 0.023315, "value_mse_loss_layer_016": 0.019409, "value_mse_loss_layer_017": 0.022827, "value_mse_loss_layer_018": 0.02063, "value_mse_loss_layer_019": 0.024048, "value_mse_loss_layer_020": 0.024658, "value_mse_loss_layer_021": 0.029785, "value_mse_loss_layer_022": 0.029175, "value_mse_loss_layer_023": 0.033691, "value_mse_loss_layer_024": 0.039062, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.05542, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.067871, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 9.1e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000194, "vq_loss_layer_008": 0.000197, "vq_loss_layer_009": 0.000286, "vq_loss_layer_010": 0.000227, "vq_loss_layer_011": 0.000261, "vq_loss_layer_012": 0.000439, "vq_loss_layer_013": 0.00036, "vq_loss_layer_014": 0.000504, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.00042, "vq_loss_layer_017": 0.000378, "vq_loss_layer_018": 0.000221, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.000215, "vq_loss_layer_021": 0.000429, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000319, "vq_loss_layer_024": 0.000328, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000843, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.291244, "epoch": 0.00697, "grad_norm": 0.004650235641747713, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.085449, "key_mse_loss_layer_022": 0.085938, "key_mse_loss_layer_023": 0.083496, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.064453, "key_mse_loss_layer_026": 0.073242, "key_mse_loss_layer_027": 0.072754, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.078613, "kv_mse_loss": 0.052789, "kv_vq_loss": 0.000507, "learning_rate": 0.0009608081945245021, "loss": 0.053296, "step": 6970, "value_mse_loss_layer_000": 0.000629, "value_mse_loss_layer_001": 0.001823, "value_mse_loss_layer_002": 0.006622, "value_mse_loss_layer_003": 0.010437, "value_mse_loss_layer_004": 0.009583, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.011719, "value_mse_loss_layer_007": 0.012085, "value_mse_loss_layer_008": 0.015625, "value_mse_loss_layer_009": 0.02002, "value_mse_loss_layer_010": 0.016846, "value_mse_loss_layer_011": 0.017578, "value_mse_loss_layer_012": 0.017944, "value_mse_loss_layer_013": 0.019775, "value_mse_loss_layer_014": 0.02063, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.023682, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.023926, "value_mse_loss_layer_020": 0.025513, "value_mse_loss_layer_021": 0.03125, "value_mse_loss_layer_022": 0.029907, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.041748, "value_mse_loss_layer_026": 0.036377, "value_mse_loss_layer_027": 0.05127, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.089355, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.066895, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000177, "vq_loss_layer_008": 0.000197, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000229, "vq_loss_layer_012": 0.000366, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000475, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000389, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.001282, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.004547 }, { "ce_loss": 2.241803, "epoch": 0.00698, "grad_norm": 0.0020322666969150305, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.061523, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.053271, "kv_vq_loss": 0.000517, "learning_rate": 0.00096096385565579, "loss": 0.053796, "step": 6980, "value_mse_loss_layer_000": 0.00061, "value_mse_loss_layer_001": 0.001823, "value_mse_loss_layer_002": 0.006622, "value_mse_loss_layer_003": 0.010864, "value_mse_loss_layer_004": 0.009277, "value_mse_loss_layer_005": 0.009338, "value_mse_loss_layer_006": 0.01123, "value_mse_loss_layer_007": 0.012817, "value_mse_loss_layer_008": 0.01532, "value_mse_loss_layer_009": 0.019897, "value_mse_loss_layer_010": 0.016479, "value_mse_loss_layer_011": 0.01709, "value_mse_loss_layer_012": 0.017822, "value_mse_loss_layer_013": 0.019531, "value_mse_loss_layer_014": 0.020142, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.02002, "value_mse_loss_layer_017": 0.023438, "value_mse_loss_layer_018": 0.021118, "value_mse_loss_layer_019": 0.023926, "value_mse_loss_layer_020": 0.025146, "value_mse_loss_layer_021": 0.028809, "value_mse_loss_layer_022": 0.029663, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.039307, "value_mse_loss_layer_027": 0.050049, "value_mse_loss_layer_028": 0.053711, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.06543, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000221, "vq_loss_layer_008": 0.000182, "vq_loss_layer_009": 0.000243, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000385, "vq_loss_layer_013": 0.000341, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000439, "vq_loss_layer_016": 0.000412, "vq_loss_layer_017": 0.000393, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000236, "vq_loss_layer_024": 0.000196, "vq_loss_layer_025": 0.000241, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000679, "vq_loss_layer_029": 0.000862, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.003525 }, { "ce_loss": 2.273247, "epoch": 0.00699, "grad_norm": 0.003003440098837018, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.091309, "key_mse_loss_layer_009": 0.09668, "key_mse_loss_layer_010": 0.109375, "key_mse_loss_layer_011": 0.105957, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.132812, "key_mse_loss_layer_014": 0.12793, "key_mse_loss_layer_015": 0.117188, "key_mse_loss_layer_016": 0.107422, "key_mse_loss_layer_017": 0.109375, "key_mse_loss_layer_018": 0.11377, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.100586, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.053229, "kv_vq_loss": 0.000523, "learning_rate": 0.0009611192939364202, "loss": 0.05376, "step": 6990, "value_mse_loss_layer_000": 0.000629, "value_mse_loss_layer_001": 0.001831, "value_mse_loss_layer_002": 0.006897, "value_mse_loss_layer_003": 0.011047, "value_mse_loss_layer_004": 0.011353, "value_mse_loss_layer_005": 0.010193, "value_mse_loss_layer_006": 0.013184, "value_mse_loss_layer_007": 0.012817, "value_mse_loss_layer_008": 0.015076, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.018921, "value_mse_loss_layer_011": 0.018433, "value_mse_loss_layer_012": 0.019043, "value_mse_loss_layer_013": 0.020874, "value_mse_loss_layer_014": 0.020996, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.018066, "value_mse_loss_layer_017": 0.022705, "value_mse_loss_layer_018": 0.018921, "value_mse_loss_layer_019": 0.023315, "value_mse_loss_layer_020": 0.023926, "value_mse_loss_layer_021": 0.026733, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.074219, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 2.4e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 0.000115, "vq_loss_layer_005": 0.000104, "vq_loss_layer_006": 0.000235, "vq_loss_layer_007": 0.000204, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000275, "vq_loss_layer_010": 0.00033, "vq_loss_layer_011": 0.000305, "vq_loss_layer_012": 0.000486, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000496, "vq_loss_layer_015": 0.000511, "vq_loss_layer_016": 0.00042, "vq_loss_layer_017": 0.000368, "vq_loss_layer_018": 0.000209, "vq_loss_layer_019": 0.00021, "vq_loss_layer_020": 0.000284, "vq_loss_layer_021": 0.000557, "vq_loss_layer_022": 0.00042, "vq_loss_layer_023": 0.000515, "vq_loss_layer_024": 0.000542, "vq_loss_layer_025": 0.000774, "vq_loss_layer_026": 0.000683, "vq_loss_layer_027": 0.000874, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002777, "vq_loss_layer_031": 0.006897 }, { "ce_loss": 2.275398, "epoch": 0.007, "grad_norm": 0.0021366931032389402, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.05296, "kv_vq_loss": 0.000503, "learning_rate": 0.000961274510003564, "loss": 0.053464, "step": 7000, "value_mse_loss_layer_000": 0.000599, "value_mse_loss_layer_001": 0.001823, "value_mse_loss_layer_002": 0.006683, "value_mse_loss_layer_003": 0.010925, "value_mse_loss_layer_004": 0.009705, "value_mse_loss_layer_005": 0.009705, "value_mse_loss_layer_006": 0.012024, "value_mse_loss_layer_007": 0.012817, "value_mse_loss_layer_008": 0.015564, "value_mse_loss_layer_009": 0.020142, "value_mse_loss_layer_010": 0.017822, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.018555, "value_mse_loss_layer_013": 0.020264, "value_mse_loss_layer_014": 0.020752, "value_mse_loss_layer_015": 0.023804, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.024048, "value_mse_loss_layer_018": 0.020142, "value_mse_loss_layer_019": 0.023926, "value_mse_loss_layer_020": 0.025879, "value_mse_loss_layer_021": 0.028687, "value_mse_loss_layer_022": 0.030273, "value_mse_loss_layer_023": 0.032227, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.042725, "value_mse_loss_layer_026": 0.03833, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.052734, "value_mse_loss_layer_029": 0.069336, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.067871, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000142, "vq_loss_layer_007": 0.000184, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000242, "vq_loss_layer_010": 0.000243, "vq_loss_layer_011": 0.000248, "vq_loss_layer_012": 0.000383, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000429, "vq_loss_layer_016": 0.000385, "vq_loss_layer_017": 0.000414, "vq_loss_layer_018": 0.000238, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000246, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.001518, "vq_loss_layer_030": 0.002609, "vq_loss_layer_031": 0.005096 }, { "ce_loss": 2.349035, "epoch": 0.00701, "grad_norm": 0.002746038604527712, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.052972, "kv_vq_loss": 0.000481, "learning_rate": 0.0009614295044916645, "loss": 0.053467, "step": 7010, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001816, "value_mse_loss_layer_002": 0.006805, "value_mse_loss_layer_003": 0.010742, "value_mse_loss_layer_004": 0.010132, "value_mse_loss_layer_005": 0.01001, "value_mse_loss_layer_006": 0.01178, "value_mse_loss_layer_007": 0.012756, "value_mse_loss_layer_008": 0.015625, "value_mse_loss_layer_009": 0.02063, "value_mse_loss_layer_010": 0.016846, "value_mse_loss_layer_011": 0.017578, "value_mse_loss_layer_012": 0.018677, "value_mse_loss_layer_013": 0.020386, "value_mse_loss_layer_014": 0.021851, "value_mse_loss_layer_015": 0.02356, "value_mse_loss_layer_016": 0.018799, "value_mse_loss_layer_017": 0.02417, "value_mse_loss_layer_018": 0.022949, "value_mse_loss_layer_019": 0.024658, "value_mse_loss_layer_020": 0.025635, "value_mse_loss_layer_021": 0.029175, "value_mse_loss_layer_022": 0.028809, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.048096, "value_mse_loss_layer_026": 0.039307, "value_mse_loss_layer_027": 0.054443, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.068848, "value_mse_loss_layer_030": 0.071289, "value_mse_loss_layer_031": 0.070312, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000191, "vq_loss_layer_008": 0.000209, "vq_loss_layer_009": 0.000261, "vq_loss_layer_010": 0.000214, "vq_loss_layer_011": 0.000238, "vq_loss_layer_012": 0.00042, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.00046, "vq_loss_layer_015": 0.000441, "vq_loss_layer_016": 0.000374, "vq_loss_layer_017": 0.000391, "vq_loss_layer_018": 0.000267, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.00041, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000395, "vq_loss_layer_026": 0.000534, "vq_loss_layer_027": 0.00069, "vq_loss_layer_028": 0.00087, "vq_loss_layer_029": 0.001045, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.322873, "epoch": 0.00702, "grad_norm": 0.0029514136258512735, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.05347, "kv_vq_loss": 0.000518, "learning_rate": 0.0009615842780324513, "loss": 0.053992, "step": 7020, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001816, "value_mse_loss_layer_002": 0.006744, "value_mse_loss_layer_003": 0.010925, "value_mse_loss_layer_004": 0.009949, "value_mse_loss_layer_005": 0.009827, "value_mse_loss_layer_006": 0.013123, "value_mse_loss_layer_007": 0.012756, "value_mse_loss_layer_008": 0.015625, "value_mse_loss_layer_009": 0.020264, "value_mse_loss_layer_010": 0.016724, "value_mse_loss_layer_011": 0.018066, "value_mse_loss_layer_012": 0.019043, "value_mse_loss_layer_013": 0.020874, "value_mse_loss_layer_014": 0.021729, "value_mse_loss_layer_015": 0.024902, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.023682, "value_mse_loss_layer_018": 0.020996, "value_mse_loss_layer_019": 0.024658, "value_mse_loss_layer_020": 0.025146, "value_mse_loss_layer_021": 0.037598, "value_mse_loss_layer_022": 0.029297, "value_mse_loss_layer_023": 0.034424, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.038574, "value_mse_loss_layer_027": 0.053467, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.077637, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000186, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.000208, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000219, "vq_loss_layer_011": 0.00025, "vq_loss_layer_012": 0.000391, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.000429, "vq_loss_layer_015": 0.000462, "vq_loss_layer_016": 0.00046, "vq_loss_layer_017": 0.000353, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000492, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000345, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000698, "vq_loss_layer_028": 0.000885, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.003891 }, { "ce_loss": 2.294527, "epoch": 0.00703, "grad_norm": 0.0023497098591178656, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.053009, "kv_vq_loss": 0.000502, "learning_rate": 0.0009617388312549559, "loss": 0.053519, "step": 7030, "value_mse_loss_layer_000": 0.000603, "value_mse_loss_layer_001": 0.001808, "value_mse_loss_layer_002": 0.006866, "value_mse_loss_layer_003": 0.011292, "value_mse_loss_layer_004": 0.010254, "value_mse_loss_layer_005": 0.010498, "value_mse_loss_layer_006": 0.012085, "value_mse_loss_layer_007": 0.012756, "value_mse_loss_layer_008": 0.015198, "value_mse_loss_layer_009": 0.02124, "value_mse_loss_layer_010": 0.017334, "value_mse_loss_layer_011": 0.016968, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.019531, "value_mse_loss_layer_014": 0.021362, "value_mse_loss_layer_015": 0.023193, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.022827, "value_mse_loss_layer_018": 0.020874, "value_mse_loss_layer_019": 0.023315, "value_mse_loss_layer_020": 0.025024, "value_mse_loss_layer_021": 0.028687, "value_mse_loss_layer_022": 0.030396, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.042725, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.048584, "value_mse_loss_layer_028": 0.053223, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.068848, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000299, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000448, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000391, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000222, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000206, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000458, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.001579, "vq_loss_layer_031": 0.003815 }, { "ce_loss": 2.354671, "epoch": 0.00704, "grad_norm": 0.002882513450458646, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.052594, "kv_vq_loss": 0.000502, "learning_rate": 0.0009618931647855279, "loss": 0.053098, "step": 7040, "value_mse_loss_layer_000": 0.00061, "value_mse_loss_layer_001": 0.001854, "value_mse_loss_layer_002": 0.006683, "value_mse_loss_layer_003": 0.011536, "value_mse_loss_layer_004": 0.010071, "value_mse_loss_layer_005": 0.01001, "value_mse_loss_layer_006": 0.011841, "value_mse_loss_layer_007": 0.012573, "value_mse_loss_layer_008": 0.016235, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.016357, "value_mse_loss_layer_011": 0.017456, "value_mse_loss_layer_012": 0.018311, "value_mse_loss_layer_013": 0.019409, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.022461, "value_mse_loss_layer_016": 0.019287, "value_mse_loss_layer_017": 0.02356, "value_mse_loss_layer_018": 0.020264, "value_mse_loss_layer_019": 0.02417, "value_mse_loss_layer_020": 0.024902, "value_mse_loss_layer_021": 0.030273, "value_mse_loss_layer_022": 0.029663, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.036377, "value_mse_loss_layer_025": 0.046143, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.050537, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.070312, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000198, "vq_loss_layer_008": 0.000239, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.00024, "vq_loss_layer_012": 0.000416, "vq_loss_layer_013": 0.000376, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000376, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000372, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000832, "vq_loss_layer_029": 0.001045, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.00415 }, { "ce_loss": 2.295654, "epoch": 0.00705, "grad_norm": 0.0029813528526574373, "key_mse_loss_layer_000": 0.00354, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.052927, "kv_vq_loss": 0.000493, "learning_rate": 0.0009620472792478496, "loss": 0.053424, "step": 7050, "value_mse_loss_layer_000": 0.000626, "value_mse_loss_layer_001": 0.001846, "value_mse_loss_layer_002": 0.006958, "value_mse_loss_layer_003": 0.011536, "value_mse_loss_layer_004": 0.010132, "value_mse_loss_layer_005": 0.009644, "value_mse_loss_layer_006": 0.011658, "value_mse_loss_layer_007": 0.012817, "value_mse_loss_layer_008": 0.015564, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.016479, "value_mse_loss_layer_011": 0.017334, "value_mse_loss_layer_012": 0.018433, "value_mse_loss_layer_013": 0.019653, "value_mse_loss_layer_014": 0.020874, "value_mse_loss_layer_015": 0.024292, "value_mse_loss_layer_016": 0.018921, "value_mse_loss_layer_017": 0.022705, "value_mse_loss_layer_018": 0.020752, "value_mse_loss_layer_019": 0.024658, "value_mse_loss_layer_020": 0.02478, "value_mse_loss_layer_021": 0.029175, "value_mse_loss_layer_022": 0.030029, "value_mse_loss_layer_023": 0.032959, "value_mse_loss_layer_024": 0.037842, "value_mse_loss_layer_025": 0.043945, "value_mse_loss_layer_026": 0.038818, "value_mse_loss_layer_027": 0.049561, "value_mse_loss_layer_028": 0.053711, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.078613, "value_mse_loss_layer_031": 0.073242, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000202, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000239, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000234, "vq_loss_layer_012": 0.000395, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.000456, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000223, "vq_loss_layer_019": 0.000223, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000303, "vq_loss_layer_024": 0.000296, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000469, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.001259, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.00412 }, { "ce_loss": 2.283652, "epoch": 0.00706, "grad_norm": 0.0029703436885029078, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.041992, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.062012, "kv_mse_loss": 0.053296, "kv_vq_loss": 0.000533, "learning_rate": 0.0009622011752629507, "loss": 0.053821, "step": 7060, "value_mse_loss_layer_000": 0.000584, "value_mse_loss_layer_001": 0.001793, "value_mse_loss_layer_002": 0.006927, "value_mse_loss_layer_003": 0.010742, "value_mse_loss_layer_004": 0.010498, "value_mse_loss_layer_005": 0.010864, "value_mse_loss_layer_006": 0.011902, "value_mse_loss_layer_007": 0.012573, "value_mse_loss_layer_008": 0.01532, "value_mse_loss_layer_009": 0.019775, "value_mse_loss_layer_010": 0.016724, "value_mse_loss_layer_011": 0.0177, "value_mse_loss_layer_012": 0.018555, "value_mse_loss_layer_013": 0.020508, "value_mse_loss_layer_014": 0.020996, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.019653, "value_mse_loss_layer_019": 0.02356, "value_mse_loss_layer_020": 0.025146, "value_mse_loss_layer_021": 0.03125, "value_mse_loss_layer_022": 0.028564, "value_mse_loss_layer_023": 0.034668, "value_mse_loss_layer_024": 0.036133, "value_mse_loss_layer_025": 0.048584, "value_mse_loss_layer_026": 0.039551, "value_mse_loss_layer_027": 0.05542, "value_mse_loss_layer_028": 0.057617, "value_mse_loss_layer_029": 0.073242, "value_mse_loss_layer_030": 0.07373, "value_mse_loss_layer_031": 0.068848, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000135, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000209, "vq_loss_layer_009": 0.000244, "vq_loss_layer_010": 0.000224, "vq_loss_layer_011": 0.000267, "vq_loss_layer_012": 0.000404, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000456, "vq_loss_layer_015": 0.000496, "vq_loss_layer_016": 0.000439, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000483, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000349, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.00041, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.002289, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.290234, "epoch": 0.00707, "grad_norm": 0.0024940017610788345, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.053244, "kv_vq_loss": 0.000512, "learning_rate": 0.0009623548534492248, "loss": 0.053745, "step": 7070, "value_mse_loss_layer_000": 0.000584, "value_mse_loss_layer_001": 0.001801, "value_mse_loss_layer_002": 0.006714, "value_mse_loss_layer_003": 0.011658, "value_mse_loss_layer_004": 0.009644, "value_mse_loss_layer_005": 0.009521, "value_mse_loss_layer_006": 0.012268, "value_mse_loss_layer_007": 0.012329, "value_mse_loss_layer_008": 0.015198, "value_mse_loss_layer_009": 0.02002, "value_mse_loss_layer_010": 0.016602, "value_mse_loss_layer_011": 0.01709, "value_mse_loss_layer_012": 0.019775, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.020142, "value_mse_loss_layer_015": 0.022705, "value_mse_loss_layer_016": 0.020142, "value_mse_loss_layer_017": 0.023438, "value_mse_loss_layer_018": 0.020264, "value_mse_loss_layer_019": 0.023682, "value_mse_loss_layer_020": 0.024902, "value_mse_loss_layer_021": 0.028809, "value_mse_loss_layer_022": 0.028931, "value_mse_loss_layer_023": 0.031738, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.043457, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.049316, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.071289, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000165, "vq_loss_layer_007": 0.000177, "vq_loss_layer_008": 0.000187, "vq_loss_layer_009": 0.000278, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.000504, "vq_loss_layer_013": 0.000347, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000443, "vq_loss_layer_017": 0.000387, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.00087, "vq_loss_layer_029": 0.000935, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.302897, "epoch": 0.00708, "grad_norm": 0.002203810727223754, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.053311, "kv_vq_loss": 0.000501, "learning_rate": 0.0009625083144224421, "loss": 0.053821, "step": 7080, "value_mse_loss_layer_000": 0.000607, "value_mse_loss_layer_001": 0.001839, "value_mse_loss_layer_002": 0.007568, "value_mse_loss_layer_003": 0.010925, "value_mse_loss_layer_004": 0.009705, "value_mse_loss_layer_005": 0.009583, "value_mse_loss_layer_006": 0.011658, "value_mse_loss_layer_007": 0.013184, "value_mse_loss_layer_008": 0.015869, "value_mse_loss_layer_009": 0.02002, "value_mse_loss_layer_010": 0.016602, "value_mse_loss_layer_011": 0.017944, "value_mse_loss_layer_012": 0.018799, "value_mse_loss_layer_013": 0.020142, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.02356, "value_mse_loss_layer_016": 0.019653, "value_mse_loss_layer_017": 0.023193, "value_mse_loss_layer_018": 0.02356, "value_mse_loss_layer_019": 0.025269, "value_mse_loss_layer_020": 0.025269, "value_mse_loss_layer_021": 0.030029, "value_mse_loss_layer_022": 0.030029, "value_mse_loss_layer_023": 0.031738, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.043701, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.048584, "value_mse_loss_layer_028": 0.05249, "value_mse_loss_layer_029": 0.068848, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000207, "vq_loss_layer_008": 0.000231, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000201, "vq_loss_layer_011": 0.000256, "vq_loss_layer_012": 0.000429, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000425, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.00042, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000246, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000854, "vq_loss_layer_029": 0.000969, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.287686, "epoch": 0.00709, "grad_norm": 0.0023501405958086252, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.052731, "kv_vq_loss": 0.000493, "learning_rate": 0.0009626615587957666, "loss": 0.053232, "step": 7090, "value_mse_loss_layer_000": 0.000587, "value_mse_loss_layer_001": 0.001808, "value_mse_loss_layer_002": 0.006439, "value_mse_loss_layer_003": 0.010193, "value_mse_loss_layer_004": 0.009644, "value_mse_loss_layer_005": 0.009399, "value_mse_loss_layer_006": 0.011353, "value_mse_loss_layer_007": 0.012085, "value_mse_loss_layer_008": 0.015015, "value_mse_loss_layer_009": 0.019409, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.016724, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.019409, "value_mse_loss_layer_014": 0.019775, "value_mse_loss_layer_015": 0.022705, "value_mse_loss_layer_016": 0.018433, "value_mse_loss_layer_017": 0.02356, "value_mse_loss_layer_018": 0.020996, "value_mse_loss_layer_019": 0.023438, "value_mse_loss_layer_020": 0.024658, "value_mse_loss_layer_021": 0.028687, "value_mse_loss_layer_022": 0.030273, "value_mse_loss_layer_023": 0.032715, "value_mse_loss_layer_024": 0.03833, "value_mse_loss_layer_025": 0.043213, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.05127, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.066406, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000182, "vq_loss_layer_008": 0.000192, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.000357, "vq_loss_layer_017": 0.000368, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000486, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000801, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.003647 }, { "ce_loss": 2.310765, "epoch": 0.0071, "grad_norm": 0.0027947332710027695, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.052808, "kv_vq_loss": 0.000495, "learning_rate": 0.0009628145871797687, "loss": 0.053311, "step": 7100, "value_mse_loss_layer_000": 0.000599, "value_mse_loss_layer_001": 0.001816, "value_mse_loss_layer_002": 0.00647, "value_mse_loss_layer_003": 0.012695, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.009521, "value_mse_loss_layer_006": 0.012207, "value_mse_loss_layer_007": 0.012268, "value_mse_loss_layer_008": 0.014954, "value_mse_loss_layer_009": 0.019775, "value_mse_loss_layer_010": 0.016968, "value_mse_loss_layer_011": 0.0177, "value_mse_loss_layer_012": 0.018677, "value_mse_loss_layer_013": 0.019653, "value_mse_loss_layer_014": 0.020752, "value_mse_loss_layer_015": 0.022705, "value_mse_loss_layer_016": 0.02002, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.024414, "value_mse_loss_layer_020": 0.025513, "value_mse_loss_layer_021": 0.029053, "value_mse_loss_layer_022": 0.028564, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.043457, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.050781, "value_mse_loss_layer_028": 0.053711, "value_mse_loss_layer_029": 0.069336, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.068359, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000185, "vq_loss_layer_008": 0.000182, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000204, "vq_loss_layer_011": 0.000241, "vq_loss_layer_012": 0.000439, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000422, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000406, "vq_loss_layer_017": 0.000362, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000239, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.003738 }, { "ce_loss": 2.310979, "epoch": 0.00711, "grad_norm": 0.0027518603019416332, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.064453, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.080078, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.052762, "kv_vq_loss": 0.000501, "learning_rate": 0.0009629674001824416, "loss": 0.053265, "step": 7110, "value_mse_loss_layer_000": 0.000607, "value_mse_loss_layer_001": 0.001816, "value_mse_loss_layer_002": 0.008789, "value_mse_loss_layer_003": 0.010559, "value_mse_loss_layer_004": 0.009949, "value_mse_loss_layer_005": 0.009766, "value_mse_loss_layer_006": 0.011536, "value_mse_loss_layer_007": 0.012939, "value_mse_loss_layer_008": 0.015381, "value_mse_loss_layer_009": 0.019897, "value_mse_loss_layer_010": 0.016846, "value_mse_loss_layer_011": 0.018066, "value_mse_loss_layer_012": 0.019287, "value_mse_loss_layer_013": 0.019775, "value_mse_loss_layer_014": 0.021362, "value_mse_loss_layer_015": 0.023804, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.023804, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.023682, "value_mse_loss_layer_020": 0.025635, "value_mse_loss_layer_021": 0.030029, "value_mse_loss_layer_022": 0.029785, "value_mse_loss_layer_023": 0.031738, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.044922, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.048096, "value_mse_loss_layer_028": 0.053711, "value_mse_loss_layer_029": 0.072266, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.066895, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 8.3e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000224, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000204, "vq_loss_layer_011": 0.000252, "vq_loss_layer_012": 0.000477, "vq_loss_layer_013": 0.000368, "vq_loss_layer_014": 0.00041, "vq_loss_layer_015": 0.000479, "vq_loss_layer_016": 0.000368, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000226, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000832, "vq_loss_layer_029": 0.000843, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.003555 }, { "ce_loss": 2.293055, "epoch": 0.00712, "grad_norm": 0.0027259215712547302, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.052884, "kv_vq_loss": 0.000497, "learning_rate": 0.0009631199984092139, "loss": 0.053381, "step": 7120, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001953, "value_mse_loss_layer_002": 0.007202, "value_mse_loss_layer_003": 0.010559, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.009583, "value_mse_loss_layer_006": 0.011536, "value_mse_loss_layer_007": 0.01239, "value_mse_loss_layer_008": 0.01532, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.016602, "value_mse_loss_layer_011": 0.01709, "value_mse_loss_layer_012": 0.018066, "value_mse_loss_layer_013": 0.019897, "value_mse_loss_layer_014": 0.020508, "value_mse_loss_layer_015": 0.023315, "value_mse_loss_layer_016": 0.019897, "value_mse_loss_layer_017": 0.023193, "value_mse_loss_layer_018": 0.020264, "value_mse_loss_layer_019": 0.02478, "value_mse_loss_layer_020": 0.025391, "value_mse_loss_layer_021": 0.029541, "value_mse_loss_layer_022": 0.030151, "value_mse_loss_layer_023": 0.036133, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.042969, "value_mse_loss_layer_026": 0.038574, "value_mse_loss_layer_027": 0.048096, "value_mse_loss_layer_028": 0.053711, "value_mse_loss_layer_029": 0.076172, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000184, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000374, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000427, "vq_loss_layer_016": 0.00041, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000315, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.00116, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.291475, "epoch": 0.00713, "grad_norm": 0.002842891728505492, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.11377, "key_mse_loss_layer_016": 0.109863, "key_mse_loss_layer_017": 0.108398, "key_mse_loss_layer_018": 0.116211, "key_mse_loss_layer_019": 0.095703, "key_mse_loss_layer_020": 0.10791, "key_mse_loss_layer_021": 0.102051, "key_mse_loss_layer_022": 0.106445, "key_mse_loss_layer_023": 0.102539, "key_mse_loss_layer_024": 0.081055, "key_mse_loss_layer_025": 0.078125, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.097168, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.098633, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.053116, "kv_vq_loss": 0.000491, "learning_rate": 0.0009632723824629663, "loss": 0.053619, "step": 7130, "value_mse_loss_layer_000": 0.000587, "value_mse_loss_layer_001": 0.001793, "value_mse_loss_layer_002": 0.006622, "value_mse_loss_layer_003": 0.010742, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.010254, "value_mse_loss_layer_006": 0.011597, "value_mse_loss_layer_007": 0.012268, "value_mse_loss_layer_008": 0.014954, "value_mse_loss_layer_009": 0.02002, "value_mse_loss_layer_010": 0.016113, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.018921, "value_mse_loss_layer_014": 0.021606, "value_mse_loss_layer_015": 0.022095, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.021973, "value_mse_loss_layer_018": 0.019409, "value_mse_loss_layer_019": 0.022949, "value_mse_loss_layer_020": 0.023804, "value_mse_loss_layer_021": 0.027588, "value_mse_loss_layer_022": 0.027466, "value_mse_loss_layer_023": 0.030151, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.036377, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.067871, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 9.1e-05, "vq_loss_layer_006": 0.000129, "vq_loss_layer_007": 0.00018, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000299, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000275, "vq_loss_layer_012": 0.000364, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000444, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000343, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000385, "vq_loss_layer_026": 0.000561, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000839, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.004089 }, { "ce_loss": 2.256153, "epoch": 0.00714, "grad_norm": 0.0021874012891203165, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.053351, "kv_vq_loss": 0.000509, "learning_rate": 0.0009634245529440435, "loss": 0.05387, "step": 7140, "value_mse_loss_layer_000": 0.000618, "value_mse_loss_layer_001": 0.001854, "value_mse_loss_layer_002": 0.006775, "value_mse_loss_layer_003": 0.011169, "value_mse_loss_layer_004": 0.010559, "value_mse_loss_layer_005": 0.009949, "value_mse_loss_layer_006": 0.011963, "value_mse_loss_layer_007": 0.012878, "value_mse_loss_layer_008": 0.015503, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.017334, "value_mse_loss_layer_011": 0.017456, "value_mse_loss_layer_012": 0.020752, "value_mse_loss_layer_013": 0.020874, "value_mse_loss_layer_014": 0.020996, "value_mse_loss_layer_015": 0.023315, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.023804, "value_mse_loss_layer_018": 0.022827, "value_mse_loss_layer_019": 0.024048, "value_mse_loss_layer_020": 0.025146, "value_mse_loss_layer_021": 0.03064, "value_mse_loss_layer_022": 0.03064, "value_mse_loss_layer_023": 0.032959, "value_mse_loss_layer_024": 0.036377, "value_mse_loss_layer_025": 0.045654, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.052002, "value_mse_loss_layer_028": 0.056641, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.07373, "value_mse_loss_layer_031": 0.071289, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000133, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000203, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000216, "vq_loss_layer_011": 0.00025, "vq_loss_layer_012": 0.0005, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000422, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.000416, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000246, "vq_loss_layer_019": 0.000197, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000444, "vq_loss_layer_022": 0.00032, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000469, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000896, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.310112, "epoch": 0.00715, "grad_norm": 0.0033279648050665855, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.052853, "kv_vq_loss": 0.000505, "learning_rate": 0.0009635765104502701, "loss": 0.053354, "step": 7150, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001823, "value_mse_loss_layer_002": 0.006836, "value_mse_loss_layer_003": 0.010925, "value_mse_loss_layer_004": 0.00946, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.011719, "value_mse_loss_layer_007": 0.01239, "value_mse_loss_layer_008": 0.015015, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.016357, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.018188, "value_mse_loss_layer_013": 0.018921, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.022705, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.022217, "value_mse_loss_layer_018": 0.020386, "value_mse_loss_layer_019": 0.024292, "value_mse_loss_layer_020": 0.024292, "value_mse_loss_layer_021": 0.029663, "value_mse_loss_layer_022": 0.028198, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.039551, "value_mse_loss_layer_027": 0.046143, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.083984, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000196, "vq_loss_layer_008": 0.000179, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000226, "vq_loss_layer_012": 0.000387, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000393, "vq_loss_layer_017": 0.000345, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000359, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.003845 }, { "ce_loss": 2.26885, "epoch": 0.00716, "grad_norm": 0.001999992411583662, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.062012, "kv_mse_loss": 0.053406, "kv_vq_loss": 0.000518, "learning_rate": 0.0009637282555769637, "loss": 0.053922, "step": 7160, "value_mse_loss_layer_000": 0.000599, "value_mse_loss_layer_001": 0.001808, "value_mse_loss_layer_002": 0.006775, "value_mse_loss_layer_003": 0.010925, "value_mse_loss_layer_004": 0.009888, "value_mse_loss_layer_005": 0.010193, "value_mse_loss_layer_006": 0.012024, "value_mse_loss_layer_007": 0.012756, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.020386, "value_mse_loss_layer_010": 0.016602, "value_mse_loss_layer_011": 0.017334, "value_mse_loss_layer_012": 0.018311, "value_mse_loss_layer_013": 0.020142, "value_mse_loss_layer_014": 0.02063, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.019043, "value_mse_loss_layer_017": 0.023926, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.023804, "value_mse_loss_layer_020": 0.025391, "value_mse_loss_layer_021": 0.028809, "value_mse_loss_layer_022": 0.028687, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.067383, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000144, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.00021, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000202, "vq_loss_layer_011": 0.000228, "vq_loss_layer_012": 0.000385, "vq_loss_layer_013": 0.000359, "vq_loss_layer_014": 0.00041, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.000378, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000192, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000412, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.000322, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000334, "vq_loss_layer_026": 0.000484, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.001617, "vq_loss_layer_031": 0.003784 }, { "ce_loss": 2.289523, "epoch": 0.00717, "grad_norm": 0.002309533767402172, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.052759, "kv_vq_loss": 0.000517, "learning_rate": 0.0009638797889169501, "loss": 0.053265, "step": 7170, "value_mse_loss_layer_000": 0.000607, "value_mse_loss_layer_001": 0.001823, "value_mse_loss_layer_002": 0.006958, "value_mse_loss_layer_003": 0.011475, "value_mse_loss_layer_004": 0.010193, "value_mse_loss_layer_005": 0.009827, "value_mse_loss_layer_006": 0.011658, "value_mse_loss_layer_007": 0.012207, "value_mse_loss_layer_008": 0.014954, "value_mse_loss_layer_009": 0.019531, "value_mse_loss_layer_010": 0.01709, "value_mse_loss_layer_011": 0.01709, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.020508, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.02356, "value_mse_loss_layer_017": 0.022461, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.023193, "value_mse_loss_layer_020": 0.024536, "value_mse_loss_layer_021": 0.030518, "value_mse_loss_layer_022": 0.029785, "value_mse_loss_layer_023": 0.032715, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.049805, "value_mse_loss_layer_028": 0.055908, "value_mse_loss_layer_029": 0.071777, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.068359, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000232, "vq_loss_layer_011": 0.000235, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.000362, "vq_loss_layer_014": 0.000458, "vq_loss_layer_015": 0.000441, "vq_loss_layer_016": 0.000542, "vq_loss_layer_017": 0.000353, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000418, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.00034, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.002792, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.289593, "epoch": 0.00718, "grad_norm": 0.002680712379515171, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.052548, "kv_vq_loss": 0.0005, "learning_rate": 0.000964031111060575, "loss": 0.053058, "step": 7180, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001831, "value_mse_loss_layer_002": 0.006683, "value_mse_loss_layer_003": 0.010559, "value_mse_loss_layer_004": 0.010193, "value_mse_loss_layer_005": 0.009705, "value_mse_loss_layer_006": 0.011719, "value_mse_loss_layer_007": 0.012573, "value_mse_loss_layer_008": 0.015442, "value_mse_loss_layer_009": 0.020752, "value_mse_loss_layer_010": 0.017334, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.023193, "value_mse_loss_layer_013": 0.020752, "value_mse_loss_layer_014": 0.020996, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.023804, "value_mse_loss_layer_018": 0.02124, "value_mse_loss_layer_019": 0.024536, "value_mse_loss_layer_020": 0.026611, "value_mse_loss_layer_021": 0.028809, "value_mse_loss_layer_022": 0.028076, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.04248, "value_mse_loss_layer_026": 0.037842, "value_mse_loss_layer_027": 0.046143, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.070312, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000137, "vq_loss_layer_007": 0.000177, "vq_loss_layer_008": 0.00023, "vq_loss_layer_009": 0.000296, "vq_loss_layer_010": 0.000259, "vq_loss_layer_011": 0.000257, "vq_loss_layer_012": 0.000736, "vq_loss_layer_013": 0.000368, "vq_loss_layer_014": 0.000463, "vq_loss_layer_015": 0.000475, "vq_loss_layer_016": 0.000469, "vq_loss_layer_017": 0.000454, "vq_loss_layer_018": 0.000265, "vq_loss_layer_019": 0.000238, "vq_loss_layer_020": 0.000343, "vq_loss_layer_021": 0.000504, "vq_loss_layer_022": 0.000353, "vq_loss_layer_023": 0.000374, "vq_loss_layer_024": 0.000353, "vq_loss_layer_025": 0.000488, "vq_loss_layer_026": 0.000706, "vq_loss_layer_027": 0.000671, "vq_loss_layer_028": 0.001091, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.005371 }, { "ce_loss": 2.282596, "epoch": 0.00719, "grad_norm": 0.004807814955711365, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.10498, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.100098, "key_mse_loss_layer_023": 0.100098, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.077148, "key_mse_loss_layer_026": 0.087891, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.052484, "kv_vq_loss": 0.000492, "learning_rate": 0.0009641822225957206, "loss": 0.052982, "step": 7190, "value_mse_loss_layer_000": 0.000603, "value_mse_loss_layer_001": 0.001831, "value_mse_loss_layer_002": 0.006653, "value_mse_loss_layer_003": 0.010742, "value_mse_loss_layer_004": 0.009888, "value_mse_loss_layer_005": 0.010193, "value_mse_loss_layer_006": 0.01178, "value_mse_loss_layer_007": 0.012878, "value_mse_loss_layer_008": 0.014954, "value_mse_loss_layer_009": 0.019531, "value_mse_loss_layer_010": 0.016357, "value_mse_loss_layer_011": 0.01709, "value_mse_loss_layer_012": 0.018433, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.02002, "value_mse_loss_layer_015": 0.021851, "value_mse_loss_layer_016": 0.018799, "value_mse_loss_layer_017": 0.023438, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.024048, "value_mse_loss_layer_020": 0.026611, "value_mse_loss_layer_021": 0.03125, "value_mse_loss_layer_022": 0.03064, "value_mse_loss_layer_023": 0.033691, "value_mse_loss_layer_024": 0.038574, "value_mse_loss_layer_025": 0.044434, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.050537, "value_mse_loss_layer_028": 0.063965, "value_mse_loss_layer_029": 0.114258, "value_mse_loss_layer_030": 0.072754, "value_mse_loss_layer_031": 0.074707, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.000138, "vq_loss_layer_007": 0.000201, "vq_loss_layer_008": 0.000189, "vq_loss_layer_009": 0.000236, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000406, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000357, "vq_loss_layer_017": 0.000427, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.00034, "vq_loss_layer_025": 0.000334, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.001381, "vq_loss_layer_029": 0.002182, "vq_loss_layer_030": 0.001671, "vq_loss_layer_031": 0.004272 }, { "ce_loss": 2.332735, "epoch": 0.0072, "grad_norm": 0.002899626735597849, "key_mse_loss_layer_000": 0.002808, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.053015, "kv_vq_loss": 0.000515, "learning_rate": 0.000964333124107817, "loss": 0.053537, "step": 7200, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001801, "value_mse_loss_layer_002": 0.007294, "value_mse_loss_layer_003": 0.01062, "value_mse_loss_layer_004": 0.010254, "value_mse_loss_layer_005": 0.009888, "value_mse_loss_layer_006": 0.012024, "value_mse_loss_layer_007": 0.01239, "value_mse_loss_layer_008": 0.01532, "value_mse_loss_layer_009": 0.019775, "value_mse_loss_layer_010": 0.016846, "value_mse_loss_layer_011": 0.016968, "value_mse_loss_layer_012": 0.017456, "value_mse_loss_layer_013": 0.019897, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.022095, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.02356, "value_mse_loss_layer_018": 0.02063, "value_mse_loss_layer_019": 0.024414, "value_mse_loss_layer_020": 0.025635, "value_mse_loss_layer_021": 0.029419, "value_mse_loss_layer_022": 0.029175, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.053711, "value_mse_loss_layer_028": 0.054932, "value_mse_loss_layer_029": 0.074707, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.073242, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.00014, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000205, "vq_loss_layer_009": 0.00024, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.00038, "vq_loss_layer_018": 0.000273, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.000671, "vq_loss_layer_028": 0.001015, "vq_loss_layer_029": 0.001343, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.275815, "epoch": 0.00721, "grad_norm": 0.0025803600437939167, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.052808, "kv_vq_loss": 0.000493, "learning_rate": 0.000964483816179857, "loss": 0.053305, "step": 7210, "value_mse_loss_layer_000": 0.000603, "value_mse_loss_layer_001": 0.001831, "value_mse_loss_layer_002": 0.006592, "value_mse_loss_layer_003": 0.010803, "value_mse_loss_layer_004": 0.010071, "value_mse_loss_layer_005": 0.009583, "value_mse_loss_layer_006": 0.011475, "value_mse_loss_layer_007": 0.013, "value_mse_loss_layer_008": 0.015198, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.017456, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.017822, "value_mse_loss_layer_013": 0.019409, "value_mse_loss_layer_014": 0.020508, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.018921, "value_mse_loss_layer_017": 0.023193, "value_mse_loss_layer_018": 0.020508, "value_mse_loss_layer_019": 0.023193, "value_mse_loss_layer_020": 0.026001, "value_mse_loss_layer_021": 0.030273, "value_mse_loss_layer_022": 0.029297, "value_mse_loss_layer_023": 0.033203, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.047852, "value_mse_loss_layer_026": 0.03833, "value_mse_loss_layer_027": 0.051025, "value_mse_loss_layer_028": 0.053955, "value_mse_loss_layer_029": 0.077637, "value_mse_loss_layer_030": 0.073242, "value_mse_loss_layer_031": 0.068848, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000209, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000246, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.00041, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.000399, "vq_loss_layer_017": 0.000385, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000226, "vq_loss_layer_021": 0.000372, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000383, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.001266, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.004303 }, { "ce_loss": 2.294954, "epoch": 0.00722, "grad_norm": 0.002573831472545862, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.052969, "kv_vq_loss": 0.000501, "learning_rate": 0.0009646342993924098, "loss": 0.053476, "step": 7220, "value_mse_loss_layer_000": 0.000599, "value_mse_loss_layer_001": 0.001808, "value_mse_loss_layer_002": 0.006561, "value_mse_loss_layer_003": 0.011353, "value_mse_loss_layer_004": 0.009766, "value_mse_loss_layer_005": 0.011169, "value_mse_loss_layer_006": 0.011902, "value_mse_loss_layer_007": 0.01239, "value_mse_loss_layer_008": 0.015076, "value_mse_loss_layer_009": 0.02002, "value_mse_loss_layer_010": 0.016724, "value_mse_loss_layer_011": 0.017334, "value_mse_loss_layer_012": 0.019531, "value_mse_loss_layer_013": 0.019775, "value_mse_loss_layer_014": 0.020142, "value_mse_loss_layer_015": 0.022827, "value_mse_loss_layer_016": 0.020264, "value_mse_loss_layer_017": 0.023071, "value_mse_loss_layer_018": 0.026001, "value_mse_loss_layer_019": 0.02356, "value_mse_loss_layer_020": 0.025879, "value_mse_loss_layer_021": 0.028687, "value_mse_loss_layer_022": 0.033203, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.04541, "value_mse_loss_layer_026": 0.037842, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000137, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000219, "vq_loss_layer_011": 0.000231, "vq_loss_layer_012": 0.000467, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.000439, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.00028, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000366, "vq_loss_layer_022": 0.000341, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000239, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000916, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.004211 }, { "ce_loss": 2.249144, "epoch": 0.00723, "grad_norm": 0.002209831727668643, "key_mse_loss_layer_000": 0.002823, "key_mse_loss_layer_001": 0.009705, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.040771, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.133789, "key_mse_loss_layer_014": 0.128906, "key_mse_loss_layer_015": 0.114746, "key_mse_loss_layer_016": 0.109863, "key_mse_loss_layer_017": 0.108398, "key_mse_loss_layer_018": 0.114746, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.099609, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.088867, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.053149, "kv_vq_loss": 0.000506, "learning_rate": 0.0009647845743236326, "loss": 0.053659, "step": 7230, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001801, "value_mse_loss_layer_002": 0.006775, "value_mse_loss_layer_003": 0.011353, "value_mse_loss_layer_004": 0.010132, "value_mse_loss_layer_005": 0.009949, "value_mse_loss_layer_006": 0.011475, "value_mse_loss_layer_007": 0.012207, "value_mse_loss_layer_008": 0.015076, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.015991, "value_mse_loss_layer_011": 0.017334, "value_mse_loss_layer_012": 0.018066, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.02002, "value_mse_loss_layer_015": 0.021362, "value_mse_loss_layer_016": 0.017456, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.018921, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.026611, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.053711, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.064941, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 7.3e-05, "vq_loss_layer_005": 9.4e-05, "vq_loss_layer_006": 0.000138, "vq_loss_layer_007": 0.000187, "vq_loss_layer_008": 0.000263, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000252, "vq_loss_layer_011": 0.000298, "vq_loss_layer_012": 0.000444, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.000488, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000431, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000213, "vq_loss_layer_019": 0.00021, "vq_loss_layer_020": 0.000236, "vq_loss_layer_021": 0.000479, "vq_loss_layer_022": 0.000313, "vq_loss_layer_023": 0.000332, "vq_loss_layer_024": 0.000393, "vq_loss_layer_025": 0.000477, "vq_loss_layer_026": 0.000614, "vq_loss_layer_027": 0.000694, "vq_loss_layer_028": 0.001656, "vq_loss_layer_029": 0.00106, "vq_loss_layer_030": 0.002808, "vq_loss_layer_031": 0.005219 }, { "ce_loss": 2.249805, "epoch": 0.00724, "grad_norm": 0.0033728156704455614, "key_mse_loss_layer_000": 0.006409, "key_mse_loss_layer_001": 0.013672, "key_mse_loss_layer_002": 0.0625, "key_mse_loss_layer_003": 0.053955, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.075684, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.089844, "key_mse_loss_layer_027": 0.094727, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.102539, "key_mse_loss_layer_030": 0.103516, "key_mse_loss_layer_031": 0.086426, "kv_mse_loss": 0.052872, "kv_vq_loss": 0.000521, "learning_rate": 0.0009649346415492867, "loss": 0.053384, "step": 7240, "value_mse_loss_layer_000": 0.000645, "value_mse_loss_layer_001": 0.001869, "value_mse_loss_layer_002": 0.006897, "value_mse_loss_layer_003": 0.011841, "value_mse_loss_layer_004": 0.010925, "value_mse_loss_layer_005": 0.010132, "value_mse_loss_layer_006": 0.012085, "value_mse_loss_layer_007": 0.012756, "value_mse_loss_layer_008": 0.015076, "value_mse_loss_layer_009": 0.019409, "value_mse_loss_layer_010": 0.016357, "value_mse_loss_layer_011": 0.01709, "value_mse_loss_layer_012": 0.017822, "value_mse_loss_layer_013": 0.019531, "value_mse_loss_layer_014": 0.020752, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.018921, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.023071, "value_mse_loss_layer_019": 0.024902, "value_mse_loss_layer_020": 0.026245, "value_mse_loss_layer_021": 0.042236, "value_mse_loss_layer_022": 0.028076, "value_mse_loss_layer_023": 0.035156, "value_mse_loss_layer_024": 0.039795, "value_mse_loss_layer_025": 0.045898, "value_mse_loss_layer_026": 0.057373, "value_mse_loss_layer_027": 0.052002, "value_mse_loss_layer_028": 0.053223, "value_mse_loss_layer_029": 0.081055, "value_mse_loss_layer_030": 0.084473, "value_mse_loss_layer_031": 0.081055, "vq_loss_layer_000": 9e-06, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 7.7e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000236, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000256, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000471, "vq_loss_layer_015": 0.000471, "vq_loss_layer_016": 0.000456, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000307, "vq_loss_layer_019": 0.000233, "vq_loss_layer_020": 0.000225, "vq_loss_layer_021": 0.000736, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000347, "vq_loss_layer_025": 0.000568, "vq_loss_layer_026": 0.001503, "vq_loss_layer_027": 0.000725, "vq_loss_layer_028": 0.000965, "vq_loss_layer_029": 0.00235, "vq_loss_layer_030": 0.004395, "vq_loss_layer_031": 0.006378 }, { "ce_loss": 2.298434, "epoch": 0.00725, "grad_norm": 0.002699207980185747, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.041016, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.132812, "key_mse_loss_layer_014": 0.12793, "key_mse_loss_layer_015": 0.115723, "key_mse_loss_layer_016": 0.108398, "key_mse_loss_layer_017": 0.106445, "key_mse_loss_layer_018": 0.11377, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.102539, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.052634, "kv_vq_loss": 0.000495, "learning_rate": 0.0009650845016427483, "loss": 0.053137, "step": 7250, "value_mse_loss_layer_000": 0.000607, "value_mse_loss_layer_001": 0.001785, "value_mse_loss_layer_002": 0.006653, "value_mse_loss_layer_003": 0.011902, "value_mse_loss_layer_004": 0.010376, "value_mse_loss_layer_005": 0.00946, "value_mse_loss_layer_006": 0.011841, "value_mse_loss_layer_007": 0.012268, "value_mse_loss_layer_008": 0.014526, "value_mse_loss_layer_009": 0.019287, "value_mse_loss_layer_010": 0.016602, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.021362, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.028564, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.060303, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.067383, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 4.6e-05, "vq_loss_layer_004": 8.6e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000156, "vq_loss_layer_007": 0.000193, "vq_loss_layer_008": 0.000225, "vq_loss_layer_009": 0.000288, "vq_loss_layer_010": 0.000319, "vq_loss_layer_011": 0.000298, "vq_loss_layer_012": 0.000408, "vq_loss_layer_013": 0.000345, "vq_loss_layer_014": 0.000486, "vq_loss_layer_015": 0.000481, "vq_loss_layer_016": 0.000467, "vq_loss_layer_017": 0.000406, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000231, "vq_loss_layer_021": 0.000557, "vq_loss_layer_022": 0.000328, "vq_loss_layer_023": 0.000511, "vq_loss_layer_024": 0.000374, "vq_loss_layer_025": 0.000465, "vq_loss_layer_026": 0.000679, "vq_loss_layer_027": 0.00066, "vq_loss_layer_028": 0.001083, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.005371 }, { "ce_loss": 2.295563, "epoch": 0.00726, "grad_norm": 0.003028400707989931, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.106934, "key_mse_loss_layer_014": 0.104004, "key_mse_loss_layer_015": 0.093262, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.077637, "kv_mse_loss": 0.053, "kv_vq_loss": 0.000502, "learning_rate": 0.0009652341551750235, "loss": 0.053513, "step": 7260, "value_mse_loss_layer_000": 0.000614, "value_mse_loss_layer_001": 0.001831, "value_mse_loss_layer_002": 0.006775, "value_mse_loss_layer_003": 0.010925, "value_mse_loss_layer_004": 0.010071, "value_mse_loss_layer_005": 0.010193, "value_mse_loss_layer_006": 0.011597, "value_mse_loss_layer_007": 0.012268, "value_mse_loss_layer_008": 0.015076, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.016724, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.020142, "value_mse_loss_layer_015": 0.02356, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.023315, "value_mse_loss_layer_018": 0.021606, "value_mse_loss_layer_019": 0.024292, "value_mse_loss_layer_020": 0.025757, "value_mse_loss_layer_021": 0.030884, "value_mse_loss_layer_022": 0.030151, "value_mse_loss_layer_023": 0.033691, "value_mse_loss_layer_024": 0.040771, "value_mse_loss_layer_025": 0.044678, "value_mse_loss_layer_026": 0.039795, "value_mse_loss_layer_027": 0.054199, "value_mse_loss_layer_028": 0.057617, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000364, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000374, "vq_loss_layer_017": 0.000401, "vq_loss_layer_018": 0.000238, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.00036, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000374, "vq_loss_layer_025": 0.000385, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000877, "vq_loss_layer_028": 0.001137, "vq_loss_layer_029": 0.001907, "vq_loss_layer_030": 0.002808, "vq_loss_layer_031": 0.004822 }, { "ce_loss": 2.323547, "epoch": 0.00727, "grad_norm": 0.0022953986190259457, "key_mse_loss_layer_000": 0.004517, "key_mse_loss_layer_001": 0.011047, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.052612, "kv_vq_loss": 0.000487, "learning_rate": 0.0009653836027147592, "loss": 0.053113, "step": 7270, "value_mse_loss_layer_000": 0.000618, "value_mse_loss_layer_001": 0.001839, "value_mse_loss_layer_002": 0.006653, "value_mse_loss_layer_003": 0.01123, "value_mse_loss_layer_004": 0.01062, "value_mse_loss_layer_005": 0.009827, "value_mse_loss_layer_006": 0.011414, "value_mse_loss_layer_007": 0.012878, "value_mse_loss_layer_008": 0.015503, "value_mse_loss_layer_009": 0.019287, "value_mse_loss_layer_010": 0.016113, "value_mse_loss_layer_011": 0.016846, "value_mse_loss_layer_012": 0.018188, "value_mse_loss_layer_013": 0.019531, "value_mse_loss_layer_014": 0.020264, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.019287, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.021484, "value_mse_loss_layer_019": 0.026123, "value_mse_loss_layer_020": 0.025879, "value_mse_loss_layer_021": 0.03064, "value_mse_loss_layer_022": 0.031982, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.043945, "value_mse_loss_layer_026": 0.039795, "value_mse_loss_layer_027": 0.05127, "value_mse_loss_layer_028": 0.056885, "value_mse_loss_layer_029": 0.075195, "value_mse_loss_layer_030": 0.073242, "value_mse_loss_layer_031": 0.070312, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000211, "vq_loss_layer_008": 0.000198, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000198, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000389, "vq_loss_layer_013": 0.000328, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000389, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000207, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000378, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000641, "vq_loss_layer_028": 0.00095, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002701, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.278396, "epoch": 0.00728, "grad_norm": 0.0033524741884320974, "key_mse_loss_layer_000": 0.003784, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.081055, "key_mse_loss_layer_025": 0.07666, "key_mse_loss_layer_026": 0.089355, "key_mse_loss_layer_027": 0.090332, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.091309, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.052811, "kv_vq_loss": 0.000508, "learning_rate": 0.0009655328448282591, "loss": 0.053333, "step": 7280, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001831, "value_mse_loss_layer_002": 0.008423, "value_mse_loss_layer_003": 0.010742, "value_mse_loss_layer_004": 0.010071, "value_mse_loss_layer_005": 0.009888, "value_mse_loss_layer_006": 0.011658, "value_mse_loss_layer_007": 0.01239, "value_mse_loss_layer_008": 0.01532, "value_mse_loss_layer_009": 0.02063, "value_mse_loss_layer_010": 0.016968, "value_mse_loss_layer_011": 0.01709, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.018188, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.02417, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.028442, "value_mse_loss_layer_022": 0.029053, "value_mse_loss_layer_023": 0.032227, "value_mse_loss_layer_024": 0.038086, "value_mse_loss_layer_025": 0.042969, "value_mse_loss_layer_026": 0.039795, "value_mse_loss_layer_027": 0.052734, "value_mse_loss_layer_028": 0.053711, "value_mse_loss_layer_029": 0.074707, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.07959, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000148, "vq_loss_layer_007": 0.000193, "vq_loss_layer_008": 0.000218, "vq_loss_layer_009": 0.000343, "vq_loss_layer_010": 0.000231, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000389, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000427, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000397, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000211, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000389, "vq_loss_layer_025": 0.00036, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.000702, "vq_loss_layer_028": 0.000885, "vq_loss_layer_029": 0.001328, "vq_loss_layer_030": 0.00473, "vq_loss_layer_031": 0.005493 }, { "ce_loss": 2.255776, "epoch": 0.00729, "grad_norm": 0.0030824746936559677, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.052902, "kv_vq_loss": 0.000524, "learning_rate": 0.0009656818820794935, "loss": 0.053415, "step": 7290, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001793, "value_mse_loss_layer_002": 0.006592, "value_mse_loss_layer_003": 0.011414, "value_mse_loss_layer_004": 0.009949, "value_mse_loss_layer_005": 0.009399, "value_mse_loss_layer_006": 0.011475, "value_mse_loss_layer_007": 0.013, "value_mse_loss_layer_008": 0.015015, "value_mse_loss_layer_009": 0.019897, "value_mse_loss_layer_010": 0.0177, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.018433, "value_mse_loss_layer_013": 0.02063, "value_mse_loss_layer_014": 0.020386, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.019653, "value_mse_loss_layer_017": 0.023804, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.023804, "value_mse_loss_layer_020": 0.025269, "value_mse_loss_layer_021": 0.035156, "value_mse_loss_layer_022": 0.029175, "value_mse_loss_layer_023": 0.033203, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.045654, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.056641, "value_mse_loss_layer_029": 0.068848, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.068359, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.00021, "vq_loss_layer_008": 0.000187, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000217, "vq_loss_layer_011": 0.000228, "vq_loss_layer_012": 0.000408, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000381, "vq_loss_layer_017": 0.000395, "vq_loss_layer_018": 0.000209, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000221, "vq_loss_layer_021": 0.000448, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000301, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.001076, "vq_loss_layer_029": 0.001122, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.00412 }, { "ce_loss": 2.276868, "epoch": 0.0073, "grad_norm": 0.0023860016372054815, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.05274, "kv_vq_loss": 0.000498, "learning_rate": 0.0009658307150301139, "loss": 0.053244, "step": 7300, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001831, "value_mse_loss_layer_002": 0.006561, "value_mse_loss_layer_003": 0.010986, "value_mse_loss_layer_004": 0.00946, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.011414, "value_mse_loss_layer_007": 0.012146, "value_mse_loss_layer_008": 0.015015, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.016235, "value_mse_loss_layer_011": 0.017578, "value_mse_loss_layer_012": 0.017334, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.020874, "value_mse_loss_layer_015": 0.023071, "value_mse_loss_layer_016": 0.018921, "value_mse_loss_layer_017": 0.023438, "value_mse_loss_layer_018": 0.020508, "value_mse_loss_layer_019": 0.024902, "value_mse_loss_layer_020": 0.025757, "value_mse_loss_layer_021": 0.030884, "value_mse_loss_layer_022": 0.030273, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.042725, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.066406, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.00019, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000194, "vq_loss_layer_011": 0.000265, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000385, "vq_loss_layer_017": 0.000422, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000668, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.001945, "vq_loss_layer_031": 0.003479 }, { "ce_loss": 2.314801, "epoch": 0.00731, "grad_norm": 0.002661355771124363, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.045166, "key_mse_loss_layer_004": 0.041992, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.072754, "key_mse_loss_layer_031": 0.057129, "kv_mse_loss": 0.052563, "kv_vq_loss": 0.000484, "learning_rate": 0.000965979344239465, "loss": 0.053064, "step": 7310, "value_mse_loss_layer_000": 0.00058, "value_mse_loss_layer_001": 0.00177, "value_mse_loss_layer_002": 0.006622, "value_mse_loss_layer_003": 0.010925, "value_mse_loss_layer_004": 0.010376, "value_mse_loss_layer_005": 0.010864, "value_mse_loss_layer_006": 0.011902, "value_mse_loss_layer_007": 0.012878, "value_mse_loss_layer_008": 0.015869, "value_mse_loss_layer_009": 0.021729, "value_mse_loss_layer_010": 0.017822, "value_mse_loss_layer_011": 0.018677, "value_mse_loss_layer_012": 0.022095, "value_mse_loss_layer_013": 0.022827, "value_mse_loss_layer_014": 0.022949, "value_mse_loss_layer_015": 0.025513, "value_mse_loss_layer_016": 0.020386, "value_mse_loss_layer_017": 0.025635, "value_mse_loss_layer_018": 0.020264, "value_mse_loss_layer_019": 0.024414, "value_mse_loss_layer_020": 0.026489, "value_mse_loss_layer_021": 0.031006, "value_mse_loss_layer_022": 0.031128, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.04541, "value_mse_loss_layer_026": 0.04126, "value_mse_loss_layer_027": 0.053955, "value_mse_loss_layer_028": 0.057617, "value_mse_loss_layer_029": 0.078613, "value_mse_loss_layer_030": 0.076172, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 4.4e-05, "vq_loss_layer_004": 7.3e-05, "vq_loss_layer_005": 0.000103, "vq_loss_layer_006": 0.000141, "vq_loss_layer_007": 0.000182, "vq_loss_layer_008": 0.000231, "vq_loss_layer_009": 0.00028, "vq_loss_layer_010": 0.000278, "vq_loss_layer_011": 0.000271, "vq_loss_layer_012": 0.000557, "vq_loss_layer_013": 0.00042, "vq_loss_layer_014": 0.0005, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.000439, "vq_loss_layer_017": 0.000467, "vq_loss_layer_018": 0.00022, "vq_loss_layer_019": 0.000201, "vq_loss_layer_020": 0.000277, "vq_loss_layer_021": 0.000507, "vq_loss_layer_022": 0.000355, "vq_loss_layer_023": 0.000376, "vq_loss_layer_024": 0.000341, "vq_loss_layer_025": 0.000431, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.001175, "vq_loss_layer_029": 0.00132, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.005066 }, { "ce_loss": 2.301981, "epoch": 0.00732, "grad_norm": 0.0021995201241225004, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.052362, "kv_vq_loss": 0.000486, "learning_rate": 0.0009661277702645979, "loss": 0.052859, "step": 7320, "value_mse_loss_layer_000": 0.000607, "value_mse_loss_layer_001": 0.001823, "value_mse_loss_layer_002": 0.006683, "value_mse_loss_layer_003": 0.010742, "value_mse_loss_layer_004": 0.00946, "value_mse_loss_layer_005": 0.009399, "value_mse_loss_layer_006": 0.011292, "value_mse_loss_layer_007": 0.013306, "value_mse_loss_layer_008": 0.015137, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.0177, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.017822, "value_mse_loss_layer_013": 0.019897, "value_mse_loss_layer_014": 0.020264, "value_mse_loss_layer_015": 0.024292, "value_mse_loss_layer_016": 0.02124, "value_mse_loss_layer_017": 0.022705, "value_mse_loss_layer_018": 0.021362, "value_mse_loss_layer_019": 0.023926, "value_mse_loss_layer_020": 0.026855, "value_mse_loss_layer_021": 0.030151, "value_mse_loss_layer_022": 0.029419, "value_mse_loss_layer_023": 0.032227, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.04248, "value_mse_loss_layer_026": 0.037109, "value_mse_loss_layer_027": 0.048584, "value_mse_loss_layer_028": 0.053223, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.068359, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000244, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000243, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000223, "vq_loss_layer_012": 0.000376, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.000471, "vq_loss_layer_016": 0.000454, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000383, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000603, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.00177, "vq_loss_layer_031": 0.003708 }, { "ce_loss": 2.279389, "epoch": 0.00733, "grad_norm": 0.002168400678783655, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.052744, "kv_vq_loss": 0.000489, "learning_rate": 0.0009662759936602817, "loss": 0.053244, "step": 7330, "value_mse_loss_layer_000": 0.000603, "value_mse_loss_layer_001": 0.001793, "value_mse_loss_layer_002": 0.007141, "value_mse_loss_layer_003": 0.011414, "value_mse_loss_layer_004": 0.01001, "value_mse_loss_layer_005": 0.009583, "value_mse_loss_layer_006": 0.011536, "value_mse_loss_layer_007": 0.012634, "value_mse_loss_layer_008": 0.014954, "value_mse_loss_layer_009": 0.019287, "value_mse_loss_layer_010": 0.016846, "value_mse_loss_layer_011": 0.018188, "value_mse_loss_layer_012": 0.017822, "value_mse_loss_layer_013": 0.019775, "value_mse_loss_layer_014": 0.020142, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.019897, "value_mse_loss_layer_017": 0.022705, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.024048, "value_mse_loss_layer_020": 0.024902, "value_mse_loss_layer_021": 0.028198, "value_mse_loss_layer_022": 0.029053, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.036377, "value_mse_loss_layer_025": 0.044434, "value_mse_loss_layer_026": 0.038818, "value_mse_loss_layer_027": 0.050537, "value_mse_loss_layer_028": 0.05249, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000206, "vq_loss_layer_008": 0.000205, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000228, "vq_loss_layer_011": 0.000315, "vq_loss_layer_012": 0.000385, "vq_loss_layer_013": 0.000404, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000448, "vq_loss_layer_016": 0.000463, "vq_loss_layer_017": 0.00037, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000246, "vq_loss_layer_021": 0.000383, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.000406, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.000732, "vq_loss_layer_028": 0.00087, "vq_loss_layer_029": 0.001236, "vq_loss_layer_030": 0.002777, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.2819, "epoch": 0.00734, "grad_norm": 0.0031706797890365124, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.052734, "kv_vq_loss": 0.000489, "learning_rate": 0.0009664240149790175, "loss": 0.053232, "step": 7340, "value_mse_loss_layer_000": 0.000603, "value_mse_loss_layer_001": 0.001823, "value_mse_loss_layer_002": 0.006622, "value_mse_loss_layer_003": 0.010742, "value_mse_loss_layer_004": 0.010132, "value_mse_loss_layer_005": 0.009583, "value_mse_loss_layer_006": 0.011353, "value_mse_loss_layer_007": 0.012207, "value_mse_loss_layer_008": 0.015747, "value_mse_loss_layer_009": 0.019287, "value_mse_loss_layer_010": 0.016113, "value_mse_loss_layer_011": 0.016846, "value_mse_loss_layer_012": 0.017456, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.018921, "value_mse_loss_layer_017": 0.021973, "value_mse_loss_layer_018": 0.021851, "value_mse_loss_layer_019": 0.023804, "value_mse_loss_layer_020": 0.024292, "value_mse_loss_layer_021": 0.028931, "value_mse_loss_layer_022": 0.028564, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.047607, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.075195, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.07373, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000184, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000233, "vq_loss_layer_010": 0.000207, "vq_loss_layer_011": 0.000223, "vq_loss_layer_012": 0.00036, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.00037, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000231, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000444, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000633, "vq_loss_layer_029": 0.000977, "vq_loss_layer_030": 0.00161, "vq_loss_layer_031": 0.004242 }, { "ce_loss": 2.289077, "epoch": 0.00735, "grad_norm": 0.0022759304847568274, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.052631, "kv_vq_loss": 0.000512, "learning_rate": 0.0009665718347710486, "loss": 0.053143, "step": 7350, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001808, "value_mse_loss_layer_002": 0.006653, "value_mse_loss_layer_003": 0.010376, "value_mse_loss_layer_004": 0.009705, "value_mse_loss_layer_005": 0.009399, "value_mse_loss_layer_006": 0.011536, "value_mse_loss_layer_007": 0.012146, "value_mse_loss_layer_008": 0.014832, "value_mse_loss_layer_009": 0.019409, "value_mse_loss_layer_010": 0.016602, "value_mse_loss_layer_011": 0.016968, "value_mse_loss_layer_012": 0.018311, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.020508, "value_mse_loss_layer_015": 0.024292, "value_mse_loss_layer_016": 0.018555, "value_mse_loss_layer_017": 0.022705, "value_mse_loss_layer_018": 0.021606, "value_mse_loss_layer_019": 0.023438, "value_mse_loss_layer_020": 0.025513, "value_mse_loss_layer_021": 0.029541, "value_mse_loss_layer_022": 0.029785, "value_mse_loss_layer_023": 0.031128, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.046631, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.069336, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.068848, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000189, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000215, "vq_loss_layer_011": 0.000224, "vq_loss_layer_012": 0.000429, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000412, "vq_loss_layer_015": 0.000538, "vq_loss_layer_016": 0.000376, "vq_loss_layer_017": 0.000412, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.000218, "vq_loss_layer_021": 0.00037, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000483, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000965, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.306245, "epoch": 0.00736, "grad_norm": 0.00229098298586905, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.104004, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.052579, "kv_vq_loss": 0.000482, "learning_rate": 0.0009667194535843745, "loss": 0.053085, "step": 7360, "value_mse_loss_layer_000": 0.000622, "value_mse_loss_layer_001": 0.001846, "value_mse_loss_layer_002": 0.006958, "value_mse_loss_layer_003": 0.012207, "value_mse_loss_layer_004": 0.010254, "value_mse_loss_layer_005": 0.009888, "value_mse_loss_layer_006": 0.011536, "value_mse_loss_layer_007": 0.012451, "value_mse_loss_layer_008": 0.015137, "value_mse_loss_layer_009": 0.019409, "value_mse_loss_layer_010": 0.015991, "value_mse_loss_layer_011": 0.016846, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.020264, "value_mse_loss_layer_015": 0.022705, "value_mse_loss_layer_016": 0.018799, "value_mse_loss_layer_017": 0.022705, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.02356, "value_mse_loss_layer_020": 0.025269, "value_mse_loss_layer_021": 0.029175, "value_mse_loss_layer_022": 0.028809, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.053223, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000193, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000259, "vq_loss_layer_010": 0.000227, "vq_loss_layer_011": 0.000254, "vq_loss_layer_012": 0.000397, "vq_loss_layer_013": 0.000376, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.000412, "vq_loss_layer_017": 0.000387, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.00023, "vq_loss_layer_021": 0.000448, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.00034, "vq_loss_layer_024": 0.000309, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000542, "vq_loss_layer_027": 0.000771, "vq_loss_layer_028": 0.000965, "vq_loss_layer_029": 0.00119, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.005096 }, { "ce_loss": 2.296576, "epoch": 0.00737, "grad_norm": 0.0035736628342419863, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.052676, "kv_vq_loss": 0.000501, "learning_rate": 0.0009668668719647628, "loss": 0.05318, "step": 7370, "value_mse_loss_layer_000": 0.000603, "value_mse_loss_layer_001": 0.001823, "value_mse_loss_layer_002": 0.006683, "value_mse_loss_layer_003": 0.010376, "value_mse_loss_layer_004": 0.01001, "value_mse_loss_layer_005": 0.009644, "value_mse_loss_layer_006": 0.011353, "value_mse_loss_layer_007": 0.012146, "value_mse_loss_layer_008": 0.014954, "value_mse_loss_layer_009": 0.019897, "value_mse_loss_layer_010": 0.019653, "value_mse_loss_layer_011": 0.017456, "value_mse_loss_layer_012": 0.019287, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.020264, "value_mse_loss_layer_015": 0.023071, "value_mse_loss_layer_016": 0.021606, "value_mse_loss_layer_017": 0.023071, "value_mse_loss_layer_018": 0.02063, "value_mse_loss_layer_019": 0.023315, "value_mse_loss_layer_020": 0.025146, "value_mse_loss_layer_021": 0.028564, "value_mse_loss_layer_022": 0.028931, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.046387, "value_mse_loss_layer_026": 0.043945, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.054443, "value_mse_loss_layer_029": 0.071289, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.067871, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000185, "vq_loss_layer_009": 0.000259, "vq_loss_layer_010": 0.000237, "vq_loss_layer_011": 0.000259, "vq_loss_layer_012": 0.000492, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000431, "vq_loss_layer_016": 0.000473, "vq_loss_layer_017": 0.000387, "vq_loss_layer_018": 0.000231, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000326, "vq_loss_layer_026": 0.000648, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.00383 }, { "ce_loss": 2.279524, "epoch": 0.00738, "grad_norm": 0.00230482523329556, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.059326, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.10791, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.099609, "key_mse_loss_layer_022": 0.100586, "key_mse_loss_layer_023": 0.103516, "key_mse_loss_layer_024": 0.084961, "key_mse_loss_layer_025": 0.081055, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.095703, "key_mse_loss_layer_028": 0.101562, "key_mse_loss_layer_029": 0.095703, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.053223, "kv_vq_loss": 0.000521, "learning_rate": 0.0009670140904557602, "loss": 0.053741, "step": 7380, "value_mse_loss_layer_000": 0.00058, "value_mse_loss_layer_001": 0.001801, "value_mse_loss_layer_002": 0.007019, "value_mse_loss_layer_003": 0.011047, "value_mse_loss_layer_004": 0.010803, "value_mse_loss_layer_005": 0.009888, "value_mse_loss_layer_006": 0.012268, "value_mse_loss_layer_007": 0.013184, "value_mse_loss_layer_008": 0.016113, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.017578, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.018921, "value_mse_loss_layer_013": 0.019775, "value_mse_loss_layer_014": 0.02063, "value_mse_loss_layer_015": 0.022461, "value_mse_loss_layer_016": 0.019409, "value_mse_loss_layer_017": 0.022827, "value_mse_loss_layer_018": 0.020142, "value_mse_loss_layer_019": 0.024414, "value_mse_loss_layer_020": 0.027954, "value_mse_loss_layer_021": 0.030762, "value_mse_loss_layer_022": 0.030396, "value_mse_loss_layer_023": 0.043701, "value_mse_loss_layer_024": 0.04248, "value_mse_loss_layer_025": 0.045654, "value_mse_loss_layer_026": 0.047852, "value_mse_loss_layer_027": 0.056641, "value_mse_loss_layer_028": 0.061279, "value_mse_loss_layer_029": 0.084961, "value_mse_loss_layer_030": 0.07959, "value_mse_loss_layer_031": 0.078125, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 3.8e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.000197, "vq_loss_layer_007": 0.000221, "vq_loss_layer_008": 0.000282, "vq_loss_layer_009": 0.000273, "vq_loss_layer_010": 0.000301, "vq_loss_layer_011": 0.000298, "vq_loss_layer_012": 0.000446, "vq_loss_layer_013": 0.000433, "vq_loss_layer_014": 0.000443, "vq_loss_layer_015": 0.000515, "vq_loss_layer_016": 0.000446, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000238, "vq_loss_layer_019": 0.000234, "vq_loss_layer_020": 0.000389, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000483, "vq_loss_layer_024": 0.000626, "vq_loss_layer_025": 0.000477, "vq_loss_layer_026": 0.000851, "vq_loss_layer_027": 0.000847, "vq_loss_layer_028": 0.00135, "vq_loss_layer_029": 0.001869, "vq_loss_layer_030": 0.003433, "vq_loss_layer_031": 0.00592 }, { "ce_loss": 2.301577, "epoch": 0.00739, "grad_norm": 0.0020664557814598083, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.052621, "kv_vq_loss": 0.000491, "learning_rate": 0.0009671611095987064, "loss": 0.053119, "step": 7390, "value_mse_loss_layer_000": 0.000587, "value_mse_loss_layer_001": 0.001778, "value_mse_loss_layer_002": 0.006409, "value_mse_loss_layer_003": 0.010925, "value_mse_loss_layer_004": 0.009583, "value_mse_loss_layer_005": 0.009888, "value_mse_loss_layer_006": 0.011475, "value_mse_loss_layer_007": 0.012207, "value_mse_loss_layer_008": 0.015137, "value_mse_loss_layer_009": 0.019897, "value_mse_loss_layer_010": 0.016602, "value_mse_loss_layer_011": 0.017334, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.019653, "value_mse_loss_layer_014": 0.022949, "value_mse_loss_layer_015": 0.022827, "value_mse_loss_layer_016": 0.018799, "value_mse_loss_layer_017": 0.023071, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.023438, "value_mse_loss_layer_020": 0.025635, "value_mse_loss_layer_021": 0.029785, "value_mse_loss_layer_022": 0.029053, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.042725, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.049316, "value_mse_loss_layer_028": 0.052734, "value_mse_loss_layer_029": 0.073242, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.067871, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.00019, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000241, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.000362, "vq_loss_layer_013": 0.000372, "vq_loss_layer_014": 0.000507, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.00036, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000347, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.00383 }, { "ce_loss": 2.312302, "epoch": 0.0074, "grad_norm": 0.0021372323390096426, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.052249, "kv_vq_loss": 0.000475, "learning_rate": 0.0009673079299327439, "loss": 0.052744, "step": 7400, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001801, "value_mse_loss_layer_002": 0.006805, "value_mse_loss_layer_003": 0.010864, "value_mse_loss_layer_004": 0.010071, "value_mse_loss_layer_005": 0.009766, "value_mse_loss_layer_006": 0.011902, "value_mse_loss_layer_007": 0.012329, "value_mse_loss_layer_008": 0.015137, "value_mse_loss_layer_009": 0.02002, "value_mse_loss_layer_010": 0.016235, "value_mse_loss_layer_011": 0.017456, "value_mse_loss_layer_012": 0.018555, "value_mse_loss_layer_013": 0.02002, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.023926, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.023315, "value_mse_loss_layer_020": 0.024414, "value_mse_loss_layer_021": 0.028564, "value_mse_loss_layer_022": 0.028809, "value_mse_loss_layer_023": 0.032959, "value_mse_loss_layer_024": 0.036377, "value_mse_loss_layer_025": 0.04248, "value_mse_loss_layer_026": 0.039062, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.053223, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.067383, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000191, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000198, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000399, "vq_loss_layer_013": 0.000383, "vq_loss_layer_014": 0.000422, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.000383, "vq_loss_layer_017": 0.000412, "vq_loss_layer_018": 0.000214, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000298, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.00082, "vq_loss_layer_029": 0.001076, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.339872, "epoch": 0.00741, "grad_norm": 0.0021753953769803047, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.052802, "kv_vq_loss": 0.00049, "learning_rate": 0.0009674545519948319, "loss": 0.053308, "step": 7410, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.001801, "value_mse_loss_layer_002": 0.006622, "value_mse_loss_layer_003": 0.010986, "value_mse_loss_layer_004": 0.010132, "value_mse_loss_layer_005": 0.010193, "value_mse_loss_layer_006": 0.011597, "value_mse_loss_layer_007": 0.012756, "value_mse_loss_layer_008": 0.01532, "value_mse_loss_layer_009": 0.020142, "value_mse_loss_layer_010": 0.017578, "value_mse_loss_layer_011": 0.017456, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.020386, "value_mse_loss_layer_014": 0.020386, "value_mse_loss_layer_015": 0.022827, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.02356, "value_mse_loss_layer_018": 0.019409, "value_mse_loss_layer_019": 0.022705, "value_mse_loss_layer_020": 0.026855, "value_mse_loss_layer_021": 0.027344, "value_mse_loss_layer_022": 0.028442, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.036133, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.061035, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.065918, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000129, "vq_loss_layer_007": 0.000196, "vq_loss_layer_008": 0.000212, "vq_loss_layer_009": 0.000265, "vq_loss_layer_010": 0.000244, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.000374, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.000414, "vq_loss_layer_015": 0.000463, "vq_loss_layer_016": 0.000422, "vq_loss_layer_017": 0.000446, "vq_loss_layer_018": 0.000222, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000292, "vq_loss_layer_021": 0.000422, "vq_loss_layer_022": 0.000311, "vq_loss_layer_023": 0.000319, "vq_loss_layer_024": 0.000372, "vq_loss_layer_025": 0.000393, "vq_loss_layer_026": 0.000542, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.003036, "vq_loss_layer_031": 0.004608 }, { "ce_loss": 2.294094, "epoch": 0.00742, "grad_norm": 0.0025780454743653536, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.052246, "kv_vq_loss": 0.000497, "learning_rate": 0.0009676009763197566, "loss": 0.05275, "step": 7420, "value_mse_loss_layer_000": 0.000607, "value_mse_loss_layer_001": 0.001839, "value_mse_loss_layer_002": 0.00647, "value_mse_loss_layer_003": 0.011047, "value_mse_loss_layer_004": 0.00946, "value_mse_loss_layer_005": 0.009338, "value_mse_loss_layer_006": 0.011475, "value_mse_loss_layer_007": 0.01239, "value_mse_loss_layer_008": 0.014771, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.016113, "value_mse_loss_layer_011": 0.016968, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.021606, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.022339, "value_mse_loss_layer_018": 0.021973, "value_mse_loss_layer_019": 0.023315, "value_mse_loss_layer_020": 0.025879, "value_mse_loss_layer_021": 0.03125, "value_mse_loss_layer_022": 0.029297, "value_mse_loss_layer_023": 0.033936, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.041016, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.046631, "value_mse_loss_layer_028": 0.053223, "value_mse_loss_layer_029": 0.073242, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.06543, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000195, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000378, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000433, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000378, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000368, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.001122, "vq_loss_layer_030": 0.001884, "vq_loss_layer_031": 0.003815 }, { "ce_loss": 2.317611, "epoch": 0.00743, "grad_norm": 0.0022711674682796, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.0625, "key_mse_loss_layer_007": 0.071777, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.075684, "key_mse_loss_layer_031": 0.060791, "kv_mse_loss": 0.052853, "kv_vq_loss": 0.000501, "learning_rate": 0.0009677472034401437, "loss": 0.053366, "step": 7430, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001778, "value_mse_loss_layer_002": 0.006439, "value_mse_loss_layer_003": 0.010803, "value_mse_loss_layer_004": 0.011475, "value_mse_loss_layer_005": 0.009766, "value_mse_loss_layer_006": 0.011414, "value_mse_loss_layer_007": 0.012024, "value_mse_loss_layer_008": 0.014954, "value_mse_loss_layer_009": 0.019409, "value_mse_loss_layer_010": 0.016846, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.019653, "value_mse_loss_layer_014": 0.020386, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.018555, "value_mse_loss_layer_017": 0.022095, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.023804, "value_mse_loss_layer_020": 0.024292, "value_mse_loss_layer_021": 0.030884, "value_mse_loss_layer_022": 0.027344, "value_mse_loss_layer_023": 0.033691, "value_mse_loss_layer_024": 0.040039, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.04834, "value_mse_loss_layer_028": 0.053955, "value_mse_loss_layer_029": 0.070312, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 0.000103, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000201, "vq_loss_layer_009": 0.000235, "vq_loss_layer_010": 0.000231, "vq_loss_layer_011": 0.000231, "vq_loss_layer_012": 0.000366, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.00041, "vq_loss_layer_015": 0.000444, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000372, "vq_loss_layer_018": 0.000221, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000496, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000473, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.001259, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.004608 }, { "ce_loss": 2.329644, "epoch": 0.00744, "grad_norm": 0.0023537143133580685, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.052368, "kv_vq_loss": 0.000469, "learning_rate": 0.0009678932338864696, "loss": 0.052863, "step": 7440, "value_mse_loss_layer_000": 0.000599, "value_mse_loss_layer_001": 0.001793, "value_mse_loss_layer_002": 0.006592, "value_mse_loss_layer_003": 0.010742, "value_mse_loss_layer_004": 0.010193, "value_mse_loss_layer_005": 0.009949, "value_mse_loss_layer_006": 0.011536, "value_mse_loss_layer_007": 0.013062, "value_mse_loss_layer_008": 0.014832, "value_mse_loss_layer_009": 0.020264, "value_mse_loss_layer_010": 0.016357, "value_mse_loss_layer_011": 0.016724, "value_mse_loss_layer_012": 0.017822, "value_mse_loss_layer_013": 0.019409, "value_mse_loss_layer_014": 0.019775, "value_mse_loss_layer_015": 0.021851, "value_mse_loss_layer_016": 0.018433, "value_mse_loss_layer_017": 0.022339, "value_mse_loss_layer_018": 0.019653, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.023926, "value_mse_loss_layer_021": 0.028564, "value_mse_loss_layer_022": 0.029541, "value_mse_loss_layer_023": 0.030884, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.067871, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000138, "vq_loss_layer_007": 0.000221, "vq_loss_layer_008": 0.000206, "vq_loss_layer_009": 0.00032, "vq_loss_layer_010": 0.000222, "vq_loss_layer_011": 0.000238, "vq_loss_layer_012": 0.000412, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000412, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000393, "vq_loss_layer_017": 0.000364, "vq_loss_layer_018": 0.000225, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000401, "vq_loss_layer_022": 0.000322, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.303158, "epoch": 0.00745, "grad_norm": 0.002208895981311798, "key_mse_loss_layer_000": 0.003693, "key_mse_loss_layer_001": 0.011353, "key_mse_loss_layer_002": 0.060547, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.124023, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.105469, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.111328, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.104004, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.100098, "key_mse_loss_layer_024": 0.08252, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.091797, "key_mse_loss_layer_027": 0.094727, "key_mse_loss_layer_028": 0.097656, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.096191, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.05289, "kv_vq_loss": 0.000503, "learning_rate": 0.000968039068187073, "loss": 0.053403, "step": 7450, "value_mse_loss_layer_000": 0.000618, "value_mse_loss_layer_001": 0.001823, "value_mse_loss_layer_002": 0.00705, "value_mse_loss_layer_003": 0.011963, "value_mse_loss_layer_004": 0.010498, "value_mse_loss_layer_005": 0.009949, "value_mse_loss_layer_006": 0.011963, "value_mse_loss_layer_007": 0.01239, "value_mse_loss_layer_008": 0.014465, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.016968, "value_mse_loss_layer_012": 0.017944, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.024292, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.023926, "value_mse_loss_layer_021": 0.027466, "value_mse_loss_layer_022": 0.027466, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.052246, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.065918, "value_mse_loss_layer_030": 0.071777, "value_mse_loss_layer_031": 0.071777, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000156, "vq_loss_layer_007": 0.000177, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.00028, "vq_loss_layer_010": 0.000238, "vq_loss_layer_011": 0.000265, "vq_loss_layer_012": 0.000444, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000452, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000418, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000292, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000207, "vq_loss_layer_021": 0.000381, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.00033, "vq_loss_layer_025": 0.000448, "vq_loss_layer_026": 0.000759, "vq_loss_layer_027": 0.00079, "vq_loss_layer_028": 0.001091, "vq_loss_layer_029": 0.001305, "vq_loss_layer_030": 0.003052, "vq_loss_layer_031": 0.005737 }, { "ce_loss": 2.276707, "epoch": 0.00746, "grad_norm": 0.0031116444151848555, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.043945, "key_mse_loss_layer_004": 0.039307, "key_mse_loss_layer_005": 0.053955, "key_mse_loss_layer_006": 0.05957, "key_mse_loss_layer_007": 0.069336, "key_mse_loss_layer_008": 0.079102, "key_mse_loss_layer_009": 0.083496, "key_mse_loss_layer_010": 0.094238, "key_mse_loss_layer_011": 0.093262, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.089844, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.080078, "key_mse_loss_layer_020": 0.087402, "key_mse_loss_layer_021": 0.084473, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.070312, "key_mse_loss_layer_031": 0.053955, "kv_mse_loss": 0.052768, "kv_vq_loss": 0.000495, "learning_rate": 0.0009681847068681672, "loss": 0.053268, "step": 7460, "value_mse_loss_layer_000": 0.000584, "value_mse_loss_layer_001": 0.00177, "value_mse_loss_layer_002": 0.006317, "value_mse_loss_layer_003": 0.010315, "value_mse_loss_layer_004": 0.009766, "value_mse_loss_layer_005": 0.009216, "value_mse_loss_layer_006": 0.010925, "value_mse_loss_layer_007": 0.012085, "value_mse_loss_layer_008": 0.015259, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.016235, "value_mse_loss_layer_011": 0.016357, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.019775, "value_mse_loss_layer_014": 0.020874, "value_mse_loss_layer_015": 0.022827, "value_mse_loss_layer_016": 0.018555, "value_mse_loss_layer_017": 0.02478, "value_mse_loss_layer_018": 0.019653, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.023438, "value_mse_loss_layer_021": 0.028687, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.051025, "value_mse_loss_layer_024": 0.040039, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.053711, "value_mse_loss_layer_029": 0.071777, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.068359, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000254, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000242, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000389, "vq_loss_layer_013": 0.000368, "vq_loss_layer_014": 0.000444, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000414, "vq_loss_layer_017": 0.000465, "vq_loss_layer_018": 0.000214, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.000725, "vq_loss_layer_024": 0.000317, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000408, "vq_loss_layer_027": 0.000458, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.001312, "vq_loss_layer_030": 0.001778, "vq_loss_layer_031": 0.00528 }, { "ce_loss": 2.307968, "epoch": 0.00747, "grad_norm": 0.002487928606569767, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.052579, "kv_vq_loss": 0.000474, "learning_rate": 0.0009683301504538497, "loss": 0.053076, "step": 7470, "value_mse_loss_layer_000": 0.00058, "value_mse_loss_layer_001": 0.001785, "value_mse_loss_layer_002": 0.006592, "value_mse_loss_layer_003": 0.010315, "value_mse_loss_layer_004": 0.009705, "value_mse_loss_layer_005": 0.010254, "value_mse_loss_layer_006": 0.01123, "value_mse_loss_layer_007": 0.012146, "value_mse_loss_layer_008": 0.014709, "value_mse_loss_layer_009": 0.020142, "value_mse_loss_layer_010": 0.01709, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.017212, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.02063, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.018188, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.022827, "value_mse_loss_layer_020": 0.024536, "value_mse_loss_layer_021": 0.02832, "value_mse_loss_layer_022": 0.029175, "value_mse_loss_layer_023": 0.033203, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.068848, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 0.000101, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000179, "vq_loss_layer_008": 0.000179, "vq_loss_layer_009": 0.000261, "vq_loss_layer_010": 0.000237, "vq_loss_layer_011": 0.000232, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000399, "vq_loss_layer_015": 0.000425, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000212, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000372, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000429, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.001823, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.307711, "epoch": 0.00748, "grad_norm": 0.0027190193068236113, "key_mse_loss_layer_000": 0.003647, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.052829, "kv_vq_loss": 0.000505, "learning_rate": 0.0009684753994661152, "loss": 0.053326, "step": 7480, "value_mse_loss_layer_000": 0.000587, "value_mse_loss_layer_001": 0.001816, "value_mse_loss_layer_002": 0.006866, "value_mse_loss_layer_003": 0.011292, "value_mse_loss_layer_004": 0.010132, "value_mse_loss_layer_005": 0.009949, "value_mse_loss_layer_006": 0.011658, "value_mse_loss_layer_007": 0.013, "value_mse_loss_layer_008": 0.015381, "value_mse_loss_layer_009": 0.019531, "value_mse_loss_layer_010": 0.016602, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.017822, "value_mse_loss_layer_013": 0.020142, "value_mse_loss_layer_014": 0.020508, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.019287, "value_mse_loss_layer_019": 0.027588, "value_mse_loss_layer_020": 0.025513, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.029053, "value_mse_loss_layer_023": 0.031128, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.038574, "value_mse_loss_layer_027": 0.048584, "value_mse_loss_layer_028": 0.05957, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.071289, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000139, "vq_loss_layer_007": 0.000216, "vq_loss_layer_008": 0.000223, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000246, "vq_loss_layer_011": 0.00028, "vq_loss_layer_012": 0.000395, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000444, "vq_loss_layer_015": 0.000526, "vq_loss_layer_016": 0.000431, "vq_loss_layer_017": 0.000389, "vq_loss_layer_018": 0.000225, "vq_loss_layer_019": 0.000244, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000534, "vq_loss_layer_022": 0.000305, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.00038, "vq_loss_layer_026": 0.000637, "vq_loss_layer_027": 0.00079, "vq_loss_layer_028": 0.001282, "vq_loss_layer_029": 0.00135, "vq_loss_layer_030": 0.002762, "vq_loss_layer_031": 0.004913 }, { "ce_loss": 2.293641, "epoch": 0.00749, "grad_norm": 0.002236344153061509, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.052872, "kv_vq_loss": 0.000492, "learning_rate": 0.0009686204544248664, "loss": 0.053375, "step": 7490, "value_mse_loss_layer_000": 0.000587, "value_mse_loss_layer_001": 0.001793, "value_mse_loss_layer_002": 0.006866, "value_mse_loss_layer_003": 0.010925, "value_mse_loss_layer_004": 0.010681, "value_mse_loss_layer_005": 0.00946, "value_mse_loss_layer_006": 0.011536, "value_mse_loss_layer_007": 0.012451, "value_mse_loss_layer_008": 0.014954, "value_mse_loss_layer_009": 0.020142, "value_mse_loss_layer_010": 0.017822, "value_mse_loss_layer_011": 0.0177, "value_mse_loss_layer_012": 0.020386, "value_mse_loss_layer_013": 0.020264, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.019287, "value_mse_loss_layer_017": 0.023438, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.024536, "value_mse_loss_layer_020": 0.025024, "value_mse_loss_layer_021": 0.029419, "value_mse_loss_layer_022": 0.027832, "value_mse_loss_layer_023": 0.031128, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.064941, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 9.4e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000191, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000237, "vq_loss_layer_011": 0.000241, "vq_loss_layer_012": 0.000465, "vq_loss_layer_013": 0.000385, "vq_loss_layer_014": 0.000443, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000422, "vq_loss_layer_017": 0.000448, "vq_loss_layer_018": 0.000265, "vq_loss_layer_019": 0.000254, "vq_loss_layer_020": 0.000271, "vq_loss_layer_021": 0.000441, "vq_loss_layer_022": 0.000383, "vq_loss_layer_023": 0.000334, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.00041, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000885, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.001823, "vq_loss_layer_031": 0.003586 }, { "ce_loss": 2.332652, "epoch": 0.0075, "grad_norm": 0.0023990545887500048, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.094727, "key_mse_loss_layer_029": 0.088867, "key_mse_loss_layer_030": 0.096191, "key_mse_loss_layer_031": 0.082031, "kv_mse_loss": 0.052451, "kv_vq_loss": 0.000501, "learning_rate": 0.000968765315847925, "loss": 0.052963, "step": 7500, "value_mse_loss_layer_000": 0.000584, "value_mse_loss_layer_001": 0.001816, "value_mse_loss_layer_002": 0.006622, "value_mse_loss_layer_003": 0.010559, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.011169, "value_mse_loss_layer_006": 0.011353, "value_mse_loss_layer_007": 0.012451, "value_mse_loss_layer_008": 0.015198, "value_mse_loss_layer_009": 0.018799, "value_mse_loss_layer_010": 0.016113, "value_mse_loss_layer_011": 0.01709, "value_mse_loss_layer_012": 0.018188, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.02002, "value_mse_loss_layer_015": 0.021973, "value_mse_loss_layer_016": 0.018433, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.023315, "value_mse_loss_layer_020": 0.024292, "value_mse_loss_layer_021": 0.028076, "value_mse_loss_layer_022": 0.029297, "value_mse_loss_layer_023": 0.033936, "value_mse_loss_layer_024": 0.03833, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.037598, "value_mse_loss_layer_027": 0.048828, "value_mse_loss_layer_028": 0.053467, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.072754, "value_mse_loss_layer_031": 0.070312, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 0.000118, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000205, "vq_loss_layer_008": 0.000215, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000208, "vq_loss_layer_011": 0.000243, "vq_loss_layer_012": 0.000416, "vq_loss_layer_013": 0.000343, "vq_loss_layer_014": 0.000412, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000423, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000224, "vq_loss_layer_019": 0.000189, "vq_loss_layer_020": 0.000238, "vq_loss_layer_021": 0.000366, "vq_loss_layer_022": 0.000309, "vq_loss_layer_023": 0.000315, "vq_loss_layer_024": 0.000353, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000484, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.295957, "epoch": 0.00751, "grad_norm": 0.002633335767313838, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.05253, "kv_vq_loss": 0.000495, "learning_rate": 0.0009689099842510419, "loss": 0.053027, "step": 7510, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001816, "value_mse_loss_layer_002": 0.00647, "value_mse_loss_layer_003": 0.010559, "value_mse_loss_layer_004": 0.010498, "value_mse_loss_layer_005": 0.009521, "value_mse_loss_layer_006": 0.011475, "value_mse_loss_layer_007": 0.012085, "value_mse_loss_layer_008": 0.015076, "value_mse_loss_layer_009": 0.019531, "value_mse_loss_layer_010": 0.016113, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.019775, "value_mse_loss_layer_015": 0.021606, "value_mse_loss_layer_016": 0.020508, "value_mse_loss_layer_017": 0.022217, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.022949, "value_mse_loss_layer_020": 0.024048, "value_mse_loss_layer_021": 0.034424, "value_mse_loss_layer_022": 0.027466, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.036133, "value_mse_loss_layer_025": 0.041748, "value_mse_loss_layer_026": 0.041748, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.069824, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.066406, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 8.8e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000142, "vq_loss_layer_007": 0.000189, "vq_loss_layer_008": 0.000215, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000211, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.000395, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000427, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000435, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000481, "vq_loss_layer_022": 0.000241, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000326, "vq_loss_layer_026": 0.000629, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.000942, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.326575, "epoch": 0.00752, "grad_norm": 0.002768593840301037, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.052621, "kv_vq_loss": 0.000483, "learning_rate": 0.0009690544601479105, "loss": 0.053128, "step": 7520, "value_mse_loss_layer_000": 0.000576, "value_mse_loss_layer_001": 0.001785, "value_mse_loss_layer_002": 0.006836, "value_mse_loss_layer_003": 0.010437, "value_mse_loss_layer_004": 0.009949, "value_mse_loss_layer_005": 0.009644, "value_mse_loss_layer_006": 0.011658, "value_mse_loss_layer_007": 0.012512, "value_mse_loss_layer_008": 0.015015, "value_mse_loss_layer_009": 0.019775, "value_mse_loss_layer_010": 0.016724, "value_mse_loss_layer_011": 0.016968, "value_mse_loss_layer_012": 0.017944, "value_mse_loss_layer_013": 0.019775, "value_mse_loss_layer_014": 0.020874, "value_mse_loss_layer_015": 0.022705, "value_mse_loss_layer_016": 0.019653, "value_mse_loss_layer_017": 0.022827, "value_mse_loss_layer_018": 0.020142, "value_mse_loss_layer_019": 0.024536, "value_mse_loss_layer_020": 0.026367, "value_mse_loss_layer_021": 0.030762, "value_mse_loss_layer_022": 0.031494, "value_mse_loss_layer_023": 0.034424, "value_mse_loss_layer_024": 0.036377, "value_mse_loss_layer_025": 0.055664, "value_mse_loss_layer_026": 0.038574, "value_mse_loss_layer_027": 0.053223, "value_mse_loss_layer_028": 0.054443, "value_mse_loss_layer_029": 0.069336, "value_mse_loss_layer_030": 0.072754, "value_mse_loss_layer_031": 0.070801, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000141, "vq_loss_layer_007": 0.0002, "vq_loss_layer_008": 0.00021, "vq_loss_layer_009": 0.000267, "vq_loss_layer_010": 0.000243, "vq_loss_layer_011": 0.000254, "vq_loss_layer_012": 0.000378, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000444, "vq_loss_layer_016": 0.000433, "vq_loss_layer_017": 0.000362, "vq_loss_layer_018": 0.000215, "vq_loss_layer_019": 0.000191, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000422, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.000324, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000515, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.00071, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001205, "vq_loss_layer_030": 0.003937, "vq_loss_layer_031": 0.004578 }, { "ce_loss": 2.303976, "epoch": 0.00753, "grad_norm": 0.0020450877491384745, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.052429, "kv_vq_loss": 0.000493, "learning_rate": 0.0009691987440501751, "loss": 0.052921, "step": 7530, "value_mse_loss_layer_000": 0.000603, "value_mse_loss_layer_001": 0.001816, "value_mse_loss_layer_002": 0.006409, "value_mse_loss_layer_003": 0.010742, "value_mse_loss_layer_004": 0.009949, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.011292, "value_mse_loss_layer_007": 0.012024, "value_mse_loss_layer_008": 0.014832, "value_mse_loss_layer_009": 0.018921, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.016846, "value_mse_loss_layer_012": 0.018311, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.02002, "value_mse_loss_layer_015": 0.022827, "value_mse_loss_layer_016": 0.018921, "value_mse_loss_layer_017": 0.022949, "value_mse_loss_layer_018": 0.020142, "value_mse_loss_layer_019": 0.025269, "value_mse_loss_layer_020": 0.024902, "value_mse_loss_layer_021": 0.028564, "value_mse_loss_layer_022": 0.029785, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.036621, "value_mse_loss_layer_025": 0.043213, "value_mse_loss_layer_026": 0.036377, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.05249, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.066895, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.7e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000204, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.000418, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000376, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000763, "vq_loss_layer_030": 0.0019, "vq_loss_layer_031": 0.003616 }, { "ce_loss": 2.294802, "epoch": 0.00754, "grad_norm": 0.0027179596945643425, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.052469, "kv_vq_loss": 0.000479, "learning_rate": 0.0009693428364674434, "loss": 0.052966, "step": 7540, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.00177, "value_mse_loss_layer_002": 0.007202, "value_mse_loss_layer_003": 0.010315, "value_mse_loss_layer_004": 0.009644, "value_mse_loss_layer_005": 0.009583, "value_mse_loss_layer_006": 0.012207, "value_mse_loss_layer_007": 0.012207, "value_mse_loss_layer_008": 0.015137, "value_mse_loss_layer_009": 0.020874, "value_mse_loss_layer_010": 0.016113, "value_mse_loss_layer_011": 0.016968, "value_mse_loss_layer_012": 0.018311, "value_mse_loss_layer_013": 0.019897, "value_mse_loss_layer_014": 0.020508, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.018555, "value_mse_loss_layer_017": 0.023438, "value_mse_loss_layer_018": 0.023682, "value_mse_loss_layer_019": 0.022705, "value_mse_loss_layer_020": 0.024536, "value_mse_loss_layer_021": 0.031006, "value_mse_loss_layer_022": 0.027954, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.04248, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.069824, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.06543, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000158, "vq_loss_layer_007": 0.000188, "vq_loss_layer_008": 0.000198, "vq_loss_layer_009": 0.000298, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.000393, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.000429, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000364, "vq_loss_layer_018": 0.00028, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000454, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.000349, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000465, "vq_loss_layer_028": 0.00069, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.001564, "vq_loss_layer_031": 0.003754 }, { "ce_loss": 2.28185, "epoch": 0.00755, "grad_norm": 0.0027199205942451954, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.052554, "kv_vq_loss": 0.000517, "learning_rate": 0.0009694867379072969, "loss": 0.053067, "step": 7550, "value_mse_loss_layer_000": 0.000599, "value_mse_loss_layer_001": 0.001785, "value_mse_loss_layer_002": 0.006531, "value_mse_loss_layer_003": 0.009949, "value_mse_loss_layer_004": 0.009155, "value_mse_loss_layer_005": 0.008972, "value_mse_loss_layer_006": 0.011169, "value_mse_loss_layer_007": 0.011902, "value_mse_loss_layer_008": 0.014526, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.017578, "value_mse_loss_layer_011": 0.017334, "value_mse_loss_layer_012": 0.017944, "value_mse_loss_layer_013": 0.019653, "value_mse_loss_layer_014": 0.02063, "value_mse_loss_layer_015": 0.023804, "value_mse_loss_layer_016": 0.019409, "value_mse_loss_layer_017": 0.024902, "value_mse_loss_layer_018": 0.020508, "value_mse_loss_layer_019": 0.023804, "value_mse_loss_layer_020": 0.026733, "value_mse_loss_layer_021": 0.029175, "value_mse_loss_layer_022": 0.03064, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.03833, "value_mse_loss_layer_027": 0.04834, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.065918, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000136, "vq_loss_layer_007": 0.000199, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.000242, "vq_loss_layer_010": 0.000284, "vq_loss_layer_011": 0.000233, "vq_loss_layer_012": 0.000385, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.000433, "vq_loss_layer_015": 0.000473, "vq_loss_layer_016": 0.000414, "vq_loss_layer_017": 0.000471, "vq_loss_layer_018": 0.000224, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000243, "vq_loss_layer_021": 0.000389, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000294, "vq_loss_layer_026": 0.000534, "vq_loss_layer_027": 0.000603, "vq_loss_layer_028": 0.000885, "vq_loss_layer_029": 0.001183, "vq_loss_layer_030": 0.002823, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.310199, "epoch": 0.00756, "grad_norm": 0.00234777107834816, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.05271, "kv_vq_loss": 0.000496, "learning_rate": 0.0009696304488753015, "loss": 0.05321, "step": 7560, "value_mse_loss_layer_000": 0.000572, "value_mse_loss_layer_001": 0.001755, "value_mse_loss_layer_002": 0.006256, "value_mse_loss_layer_003": 0.010376, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.009338, "value_mse_loss_layer_006": 0.011169, "value_mse_loss_layer_007": 0.011841, "value_mse_loss_layer_008": 0.014587, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.016357, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.018433, "value_mse_loss_layer_013": 0.02002, "value_mse_loss_layer_014": 0.020264, "value_mse_loss_layer_015": 0.022217, "value_mse_loss_layer_016": 0.02002, "value_mse_loss_layer_017": 0.022461, "value_mse_loss_layer_018": 0.019287, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.027588, "value_mse_loss_layer_022": 0.027222, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.067383, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 7.3e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000212, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000234, "vq_loss_layer_011": 0.000252, "vq_loss_layer_012": 0.000437, "vq_loss_layer_013": 0.000343, "vq_loss_layer_014": 0.000429, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000448, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000391, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000393, "vq_loss_layer_026": 0.000484, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.000931, "vq_loss_layer_029": 0.001259, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.004578 }, { "ce_loss": 2.29524, "epoch": 0.00757, "grad_norm": 0.003683781949803233, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.066406, "key_mse_loss_layer_025": 0.064453, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.052643, "kv_vq_loss": 0.000486, "learning_rate": 0.0009697739698750182, "loss": 0.053149, "step": 7570, "value_mse_loss_layer_000": 0.000618, "value_mse_loss_layer_001": 0.001793, "value_mse_loss_layer_002": 0.006226, "value_mse_loss_layer_003": 0.010254, "value_mse_loss_layer_004": 0.009155, "value_mse_loss_layer_005": 0.008972, "value_mse_loss_layer_006": 0.011108, "value_mse_loss_layer_007": 0.011902, "value_mse_loss_layer_008": 0.014221, "value_mse_loss_layer_009": 0.019287, "value_mse_loss_layer_010": 0.016235, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.018066, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.022827, "value_mse_loss_layer_016": 0.019043, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.019287, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.023315, "value_mse_loss_layer_021": 0.028198, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.038086, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.04834, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.068848, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.061768, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.00018, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.000427, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.000429, "vq_loss_layer_016": 0.000402, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000372, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.000414, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000444, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000843, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.003571 }, { "ce_loss": 2.303344, "epoch": 0.00758, "grad_norm": 0.003084969474002719, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.0526, "kv_vq_loss": 0.000491, "learning_rate": 0.0009699173014080133, "loss": 0.053107, "step": 7580, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001816, "value_mse_loss_layer_002": 0.006653, "value_mse_loss_layer_003": 0.010315, "value_mse_loss_layer_004": 0.009888, "value_mse_loss_layer_005": 0.00946, "value_mse_loss_layer_006": 0.011169, "value_mse_loss_layer_007": 0.012085, "value_mse_loss_layer_008": 0.015076, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.016235, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.019409, "value_mse_loss_layer_014": 0.020996, "value_mse_loss_layer_015": 0.022827, "value_mse_loss_layer_016": 0.019653, "value_mse_loss_layer_017": 0.023926, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.023315, "value_mse_loss_layer_020": 0.02478, "value_mse_loss_layer_021": 0.028564, "value_mse_loss_layer_022": 0.029541, "value_mse_loss_layer_023": 0.036621, "value_mse_loss_layer_024": 0.036377, "value_mse_loss_layer_025": 0.047363, "value_mse_loss_layer_026": 0.039062, "value_mse_loss_layer_027": 0.052734, "value_mse_loss_layer_028": 0.053467, "value_mse_loss_layer_029": 0.079102, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.068848, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000188, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000198, "vq_loss_layer_011": 0.000223, "vq_loss_layer_012": 0.000383, "vq_loss_layer_013": 0.000359, "vq_loss_layer_014": 0.000473, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000425, "vq_loss_layer_017": 0.00041, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000182, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000288, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.00034, "vq_loss_layer_026": 0.000469, "vq_loss_layer_027": 0.000599, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.001213, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.00412 }, { "ce_loss": 2.243557, "epoch": 0.00759, "grad_norm": 0.002440792741253972, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.05957, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.052719, "kv_vq_loss": 0.000492, "learning_rate": 0.0009700604439738701, "loss": 0.053223, "step": 7590, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001785, "value_mse_loss_layer_002": 0.006256, "value_mse_loss_layer_003": 0.010681, "value_mse_loss_layer_004": 0.009094, "value_mse_loss_layer_005": 0.008911, "value_mse_loss_layer_006": 0.011475, "value_mse_loss_layer_007": 0.011902, "value_mse_loss_layer_008": 0.014526, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.017334, "value_mse_loss_layer_012": 0.017334, "value_mse_loss_layer_013": 0.018921, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.022217, "value_mse_loss_layer_016": 0.018799, "value_mse_loss_layer_017": 0.022217, "value_mse_loss_layer_018": 0.020508, "value_mse_loss_layer_019": 0.023193, "value_mse_loss_layer_020": 0.024048, "value_mse_loss_layer_021": 0.030273, "value_mse_loss_layer_022": 0.028198, "value_mse_loss_layer_023": 0.032959, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.065918, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.000362, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.00022, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.001686, "vq_loss_layer_031": 0.003525 }, { "ce_loss": 2.273374, "epoch": 0.0076, "grad_norm": 0.002376687480136752, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.052347, "kv_vq_loss": 0.000499, "learning_rate": 0.0009702033980701978, "loss": 0.052853, "step": 7600, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001823, "value_mse_loss_layer_002": 0.006409, "value_mse_loss_layer_003": 0.010376, "value_mse_loss_layer_004": 0.009277, "value_mse_loss_layer_005": 0.00946, "value_mse_loss_layer_006": 0.011536, "value_mse_loss_layer_007": 0.012024, "value_mse_loss_layer_008": 0.015076, "value_mse_loss_layer_009": 0.019531, "value_mse_loss_layer_010": 0.016968, "value_mse_loss_layer_011": 0.016968, "value_mse_loss_layer_012": 0.020142, "value_mse_loss_layer_013": 0.019531, "value_mse_loss_layer_014": 0.020508, "value_mse_loss_layer_015": 0.023193, "value_mse_loss_layer_016": 0.018799, "value_mse_loss_layer_017": 0.022705, "value_mse_loss_layer_018": 0.019531, "value_mse_loss_layer_019": 0.023926, "value_mse_loss_layer_020": 0.024536, "value_mse_loss_layer_021": 0.029053, "value_mse_loss_layer_022": 0.029785, "value_mse_loss_layer_023": 0.031738, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.042969, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.051514, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000185, "vq_loss_layer_008": 0.000192, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000239, "vq_loss_layer_012": 0.000553, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000418, "vq_loss_layer_015": 0.000479, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000391, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.000969, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.003357 }, { "ce_loss": 2.240725, "epoch": 0.00761, "grad_norm": 0.002379770390689373, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.124512, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.111816, "key_mse_loss_layer_016": 0.104492, "key_mse_loss_layer_017": 0.105469, "key_mse_loss_layer_018": 0.111328, "key_mse_loss_layer_019": 0.094238, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.101074, "key_mse_loss_layer_022": 0.103516, "key_mse_loss_layer_023": 0.100586, "key_mse_loss_layer_024": 0.080078, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.089355, "key_mse_loss_layer_027": 0.087891, "key_mse_loss_layer_028": 0.096191, "key_mse_loss_layer_029": 0.089355, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.052829, "kv_vq_loss": 0.000493, "learning_rate": 0.000970346164192643, "loss": 0.053326, "step": 7610, "value_mse_loss_layer_000": 0.000584, "value_mse_loss_layer_001": 0.001755, "value_mse_loss_layer_002": 0.0065, "value_mse_loss_layer_003": 0.010437, "value_mse_loss_layer_004": 0.009644, "value_mse_loss_layer_005": 0.009705, "value_mse_loss_layer_006": 0.011292, "value_mse_loss_layer_007": 0.012451, "value_mse_loss_layer_008": 0.014893, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.016479, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.019409, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.021362, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.021973, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.022949, "value_mse_loss_layer_020": 0.024536, "value_mse_loss_layer_021": 0.027466, "value_mse_loss_layer_022": 0.028931, "value_mse_loss_layer_023": 0.030762, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.04248, "value_mse_loss_layer_026": 0.039551, "value_mse_loss_layer_027": 0.049316, "value_mse_loss_layer_028": 0.053223, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.067871, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000206, "vq_loss_layer_008": 0.000206, "vq_loss_layer_009": 0.000296, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000236, "vq_loss_layer_012": 0.000416, "vq_loss_layer_013": 0.000362, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000458, "vq_loss_layer_017": 0.000353, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000182, "vq_loss_layer_020": 0.000238, "vq_loss_layer_021": 0.000387, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000603, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.000931, "vq_loss_layer_029": 0.00106, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.004089 }, { "ce_loss": 2.262002, "epoch": 0.00762, "grad_norm": 0.0026005522813647985, "key_mse_loss_layer_000": 0.004303, "key_mse_loss_layer_001": 0.013, "key_mse_loss_layer_002": 0.067871, "key_mse_loss_layer_003": 0.054199, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.10791, "key_mse_loss_layer_016": 0.106934, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.120117, "key_mse_loss_layer_019": 0.102051, "key_mse_loss_layer_020": 0.112305, "key_mse_loss_layer_021": 0.104492, "key_mse_loss_layer_022": 0.117676, "key_mse_loss_layer_023": 0.123047, "key_mse_loss_layer_024": 0.103027, "key_mse_loss_layer_025": 0.099121, "key_mse_loss_layer_026": 0.113281, "key_mse_loss_layer_027": 0.12793, "key_mse_loss_layer_028": 0.125977, "key_mse_loss_layer_029": 0.126953, "key_mse_loss_layer_030": 0.126953, "key_mse_loss_layer_031": 0.103027, "kv_mse_loss": 0.052719, "kv_vq_loss": 0.000484, "learning_rate": 0.0009704887428349001, "loss": 0.05322, "step": 7620, "value_mse_loss_layer_000": 0.000607, "value_mse_loss_layer_001": 0.001915, "value_mse_loss_layer_002": 0.007141, "value_mse_loss_layer_003": 0.012207, "value_mse_loss_layer_004": 0.011169, "value_mse_loss_layer_005": 0.00946, "value_mse_loss_layer_006": 0.010986, "value_mse_loss_layer_007": 0.011169, "value_mse_loss_layer_008": 0.013977, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.013916, "value_mse_loss_layer_011": 0.014954, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.020996, "value_mse_loss_layer_019": 0.02356, "value_mse_loss_layer_020": 0.026123, "value_mse_loss_layer_021": 0.027954, "value_mse_loss_layer_022": 0.029907, "value_mse_loss_layer_023": 0.040527, "value_mse_loss_layer_024": 0.043213, "value_mse_loss_layer_025": 0.050537, "value_mse_loss_layer_026": 0.050049, "value_mse_loss_layer_027": 0.06543, "value_mse_loss_layer_028": 0.07959, "value_mse_loss_layer_029": 0.100586, "value_mse_loss_layer_030": 0.098633, "value_mse_loss_layer_031": 0.091797, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2.1e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 5.5e-05, "vq_loss_layer_004": 8.5e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000133, "vq_loss_layer_008": 0.000229, "vq_loss_layer_009": 0.000171, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000265, "vq_loss_layer_013": 0.000182, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000243, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000241, "vq_loss_layer_018": 0.000296, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000129, "vq_loss_layer_021": 0.000188, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.00045, "vq_loss_layer_026": 0.000568, "vq_loss_layer_027": 0.000687, "vq_loss_layer_028": 0.001884, "vq_loss_layer_029": 0.002747, "vq_loss_layer_030": 0.00296, "vq_loss_layer_031": 0.006775 }, { "ce_loss": 2.281728, "epoch": 0.00763, "grad_norm": 0.0020412756130099297, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.041748, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.097656, "key_mse_loss_layer_010": 0.109863, "key_mse_loss_layer_011": 0.106445, "key_mse_loss_layer_012": 0.080078, "key_mse_loss_layer_013": 0.143555, "key_mse_loss_layer_014": 0.140625, "key_mse_loss_layer_015": 0.125, "key_mse_loss_layer_016": 0.116699, "key_mse_loss_layer_017": 0.115234, "key_mse_loss_layer_018": 0.118164, "key_mse_loss_layer_019": 0.094727, "key_mse_loss_layer_020": 0.111328, "key_mse_loss_layer_021": 0.105469, "key_mse_loss_layer_022": 0.110352, "key_mse_loss_layer_023": 0.106934, "key_mse_loss_layer_024": 0.084961, "key_mse_loss_layer_025": 0.081543, "key_mse_loss_layer_026": 0.095215, "key_mse_loss_layer_027": 0.094727, "key_mse_loss_layer_028": 0.100586, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.052451, "kv_vq_loss": 0.0005, "learning_rate": 0.0009706311344887199, "loss": 0.052957, "step": 7630, "value_mse_loss_layer_000": 0.000607, "value_mse_loss_layer_001": 0.001801, "value_mse_loss_layer_002": 0.006958, "value_mse_loss_layer_003": 0.01062, "value_mse_loss_layer_004": 0.010132, "value_mse_loss_layer_005": 0.010132, "value_mse_loss_layer_006": 0.01239, "value_mse_loss_layer_007": 0.013, "value_mse_loss_layer_008": 0.015015, "value_mse_loss_layer_009": 0.020508, "value_mse_loss_layer_010": 0.016968, "value_mse_loss_layer_011": 0.017822, "value_mse_loss_layer_012": 0.020142, "value_mse_loss_layer_013": 0.019775, "value_mse_loss_layer_014": 0.020142, "value_mse_loss_layer_015": 0.02124, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.019165, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.02356, "value_mse_loss_layer_021": 0.026978, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.051514, "value_mse_loss_layer_029": 0.068848, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.067383, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 2.2e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 0.000101, "vq_loss_layer_006": 0.000179, "vq_loss_layer_007": 0.000205, "vq_loss_layer_008": 0.000256, "vq_loss_layer_009": 0.00033, "vq_loss_layer_010": 0.000296, "vq_loss_layer_011": 0.000288, "vq_loss_layer_012": 0.00058, "vq_loss_layer_013": 0.000387, "vq_loss_layer_014": 0.000511, "vq_loss_layer_015": 0.000473, "vq_loss_layer_016": 0.000488, "vq_loss_layer_017": 0.000368, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000195, "vq_loss_layer_020": 0.000277, "vq_loss_layer_021": 0.000469, "vq_loss_layer_022": 0.000345, "vq_loss_layer_023": 0.000336, "vq_loss_layer_024": 0.000349, "vq_loss_layer_025": 0.000504, "vq_loss_layer_026": 0.00058, "vq_loss_layer_027": 0.000645, "vq_loss_layer_028": 0.001343, "vq_loss_layer_029": 0.001396, "vq_loss_layer_030": 0.002762, "vq_loss_layer_031": 0.005219 }, { "ce_loss": 2.264422, "epoch": 0.00764, "grad_norm": 0.0023864982649683952, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.052441, "kv_vq_loss": 0.000499, "learning_rate": 0.0009707733396439223, "loss": 0.052954, "step": 7640, "value_mse_loss_layer_000": 0.000587, "value_mse_loss_layer_001": 0.001785, "value_mse_loss_layer_002": 0.006653, "value_mse_loss_layer_003": 0.010803, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.009216, "value_mse_loss_layer_006": 0.011353, "value_mse_loss_layer_007": 0.011902, "value_mse_loss_layer_008": 0.014648, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.016113, "value_mse_loss_layer_011": 0.017456, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.020142, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.023193, "value_mse_loss_layer_016": 0.019409, "value_mse_loss_layer_017": 0.022339, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.025757, "value_mse_loss_layer_020": 0.025513, "value_mse_loss_layer_021": 0.028198, "value_mse_loss_layer_022": 0.028442, "value_mse_loss_layer_023": 0.031738, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.041992, "value_mse_loss_layer_026": 0.040039, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.051514, "value_mse_loss_layer_029": 0.065918, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.065918, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000138, "vq_loss_layer_007": 0.000177, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000265, "vq_loss_layer_012": 0.000374, "vq_loss_layer_013": 0.000355, "vq_loss_layer_014": 0.00046, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.000408, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000553, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.002548, "vq_loss_layer_031": 0.003769 }, { "ce_loss": 2.339441, "epoch": 0.00765, "grad_norm": 0.0020300368778407574, "key_mse_loss_layer_000": 0.005249, "key_mse_loss_layer_001": 0.012695, "key_mse_loss_layer_002": 0.060059, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.123047, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.106445, "key_mse_loss_layer_017": 0.104004, "key_mse_loss_layer_018": 0.114746, "key_mse_loss_layer_019": 0.094727, "key_mse_loss_layer_020": 0.105469, "key_mse_loss_layer_021": 0.099609, "key_mse_loss_layer_022": 0.10498, "key_mse_loss_layer_023": 0.102051, "key_mse_loss_layer_024": 0.083496, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.09375, "key_mse_loss_layer_027": 0.095703, "key_mse_loss_layer_028": 0.099121, "key_mse_loss_layer_029": 0.09375, "key_mse_loss_layer_030": 0.10498, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.052319, "kv_vq_loss": 0.000477, "learning_rate": 0.0009709153587884043, "loss": 0.052811, "step": 7650, "value_mse_loss_layer_000": 0.000637, "value_mse_loss_layer_001": 0.001793, "value_mse_loss_layer_002": 0.006744, "value_mse_loss_layer_003": 0.010986, "value_mse_loss_layer_004": 0.010071, "value_mse_loss_layer_005": 0.009888, "value_mse_loss_layer_006": 0.011047, "value_mse_loss_layer_007": 0.012146, "value_mse_loss_layer_008": 0.014343, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.015503, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.017334, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.019043, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.023438, "value_mse_loss_layer_021": 0.026489, "value_mse_loss_layer_022": 0.028198, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.071289, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000197, "vq_loss_layer_008": 0.000212, "vq_loss_layer_009": 0.000273, "vq_loss_layer_010": 0.000221, "vq_loss_layer_011": 0.00023, "vq_loss_layer_012": 0.000399, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000429, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000406, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.000322, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000443, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.001404, "vq_loss_layer_030": 0.002853, "vq_loss_layer_031": 0.005463 }, { "ce_loss": 2.278613, "epoch": 0.00766, "grad_norm": 0.0025703643914312124, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.052258, "kv_vq_loss": 0.000473, "learning_rate": 0.0009710571924081508, "loss": 0.052753, "step": 7660, "value_mse_loss_layer_000": 0.000587, "value_mse_loss_layer_001": 0.00177, "value_mse_loss_layer_002": 0.006287, "value_mse_loss_layer_003": 0.010132, "value_mse_loss_layer_004": 0.009399, "value_mse_loss_layer_005": 0.009399, "value_mse_loss_layer_006": 0.011169, "value_mse_loss_layer_007": 0.011963, "value_mse_loss_layer_008": 0.014648, "value_mse_loss_layer_009": 0.018799, "value_mse_loss_layer_010": 0.016479, "value_mse_loss_layer_011": 0.01709, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.02002, "value_mse_loss_layer_015": 0.022461, "value_mse_loss_layer_016": 0.018066, "value_mse_loss_layer_017": 0.021851, "value_mse_loss_layer_018": 0.02124, "value_mse_loss_layer_019": 0.021973, "value_mse_loss_layer_020": 0.023804, "value_mse_loss_layer_021": 0.02832, "value_mse_loss_layer_022": 0.028076, "value_mse_loss_layer_023": 0.031738, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.06543, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000189, "vq_loss_layer_008": 0.000194, "vq_loss_layer_009": 0.000239, "vq_loss_layer_010": 0.000227, "vq_loss_layer_011": 0.000242, "vq_loss_layer_012": 0.000416, "vq_loss_layer_013": 0.000366, "vq_loss_layer_014": 0.000443, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000271, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000397, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.000866, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.003738 }, { "ce_loss": 2.30351, "epoch": 0.00767, "grad_norm": 0.002622573170810938, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.052182, "kv_vq_loss": 0.000484, "learning_rate": 0.0009711988409872451, "loss": 0.052682, "step": 7670, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001801, "value_mse_loss_layer_002": 0.006866, "value_mse_loss_layer_003": 0.011414, "value_mse_loss_layer_004": 0.009644, "value_mse_loss_layer_005": 0.009216, "value_mse_loss_layer_006": 0.01239, "value_mse_loss_layer_007": 0.012146, "value_mse_loss_layer_008": 0.015015, "value_mse_loss_layer_009": 0.019775, "value_mse_loss_layer_010": 0.016113, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.017822, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.020142, "value_mse_loss_layer_015": 0.022705, "value_mse_loss_layer_016": 0.019043, "value_mse_loss_layer_017": 0.022339, "value_mse_loss_layer_018": 0.019531, "value_mse_loss_layer_019": 0.023804, "value_mse_loss_layer_020": 0.024536, "value_mse_loss_layer_021": 0.029053, "value_mse_loss_layer_022": 0.028076, "value_mse_loss_layer_023": 0.030273, "value_mse_loss_layer_024": 0.036377, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.06543, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000164, "vq_loss_layer_007": 0.00019, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.000205, "vq_loss_layer_011": 0.000241, "vq_loss_layer_012": 0.000374, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000435, "vq_loss_layer_017": 0.000349, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000241, "vq_loss_layer_021": 0.000391, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000275, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.003799 }, { "ce_loss": 2.286784, "epoch": 0.00768, "grad_norm": 0.002141160424798727, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.060547, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.05275, "kv_vq_loss": 0.000499, "learning_rate": 0.0009713403050078778, "loss": 0.053265, "step": 7680, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001785, "value_mse_loss_layer_002": 0.006409, "value_mse_loss_layer_003": 0.01062, "value_mse_loss_layer_004": 0.009338, "value_mse_loss_layer_005": 0.009583, "value_mse_loss_layer_006": 0.011536, "value_mse_loss_layer_007": 0.012146, "value_mse_loss_layer_008": 0.014648, "value_mse_loss_layer_009": 0.019287, "value_mse_loss_layer_010": 0.016479, "value_mse_loss_layer_011": 0.01709, "value_mse_loss_layer_012": 0.019653, "value_mse_loss_layer_013": 0.019653, "value_mse_loss_layer_014": 0.020386, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.018921, "value_mse_loss_layer_017": 0.022705, "value_mse_loss_layer_018": 0.020142, "value_mse_loss_layer_019": 0.023682, "value_mse_loss_layer_020": 0.026855, "value_mse_loss_layer_021": 0.028076, "value_mse_loss_layer_022": 0.029419, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.036621, "value_mse_loss_layer_025": 0.041992, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.065918, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.068359, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000205, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000232, "vq_loss_layer_012": 0.000504, "vq_loss_layer_013": 0.000376, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.00037, "vq_loss_layer_017": 0.000355, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000244, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.000538, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.00069, "vq_loss_layer_029": 0.000965, "vq_loss_layer_030": 0.002899, "vq_loss_layer_031": 0.00383 }, { "ce_loss": 2.289416, "epoch": 0.00769, "grad_norm": 0.001981170382350683, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.041992, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.106445, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.132812, "key_mse_loss_layer_014": 0.129883, "key_mse_loss_layer_015": 0.116699, "key_mse_loss_layer_016": 0.112305, "key_mse_loss_layer_017": 0.108887, "key_mse_loss_layer_018": 0.117676, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.102539, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.104492, "key_mse_loss_layer_023": 0.102539, "key_mse_loss_layer_024": 0.08252, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.094727, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.05231, "kv_vq_loss": 0.000491, "learning_rate": 0.0009714815849503576, "loss": 0.052814, "step": 7690, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.00174, "value_mse_loss_layer_002": 0.006592, "value_mse_loss_layer_003": 0.010559, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.009766, "value_mse_loss_layer_006": 0.011292, "value_mse_loss_layer_007": 0.012024, "value_mse_loss_layer_008": 0.014832, "value_mse_loss_layer_009": 0.018311, "value_mse_loss_layer_010": 0.015503, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.019409, "value_mse_loss_layer_019": 0.021606, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.025635, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.028564, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.064941, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000248, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000259, "vq_loss_layer_011": 0.000244, "vq_loss_layer_012": 0.000397, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000469, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.000381, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000211, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000238, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000322, "vq_loss_layer_025": 0.000422, "vq_loss_layer_026": 0.000542, "vq_loss_layer_027": 0.000652, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002731, "vq_loss_layer_031": 0.005127 }, { "ce_loss": 2.316053, "epoch": 0.0077, "grad_norm": 0.0022878386080265045, "key_mse_loss_layer_000": 0.003998, "key_mse_loss_layer_001": 0.011597, "key_mse_loss_layer_002": 0.059082, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.10498, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.112305, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.104004, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.103027, "key_mse_loss_layer_023": 0.102051, "key_mse_loss_layer_024": 0.084473, "key_mse_loss_layer_025": 0.078613, "key_mse_loss_layer_026": 0.093262, "key_mse_loss_layer_027": 0.095215, "key_mse_loss_layer_028": 0.098633, "key_mse_loss_layer_029": 0.092773, "key_mse_loss_layer_030": 0.099121, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.052267, "kv_vq_loss": 0.00048, "learning_rate": 0.0009716226812931203, "loss": 0.052768, "step": 7700, "value_mse_loss_layer_000": 0.000587, "value_mse_loss_layer_001": 0.001808, "value_mse_loss_layer_002": 0.006714, "value_mse_loss_layer_003": 0.01062, "value_mse_loss_layer_004": 0.010498, "value_mse_loss_layer_005": 0.009338, "value_mse_loss_layer_006": 0.011047, "value_mse_loss_layer_007": 0.012268, "value_mse_loss_layer_008": 0.014343, "value_mse_loss_layer_009": 0.018311, "value_mse_loss_layer_010": 0.015137, "value_mse_loss_layer_011": 0.015869, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.0177, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.017456, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.023926, "value_mse_loss_layer_021": 0.025269, "value_mse_loss_layer_022": 0.027832, "value_mse_loss_layer_023": 0.034912, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.041016, "value_mse_loss_layer_026": 0.037598, "value_mse_loss_layer_027": 0.04834, "value_mse_loss_layer_028": 0.051514, "value_mse_loss_layer_029": 0.07666, "value_mse_loss_layer_030": 0.074707, "value_mse_loss_layer_031": 0.070801, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000187, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000225, "vq_loss_layer_011": 0.000225, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000397, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.00022, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000338, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000418, "vq_loss_layer_026": 0.000584, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.001556, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.005066 }, { "ce_loss": 2.302349, "epoch": 0.00771, "grad_norm": 0.002563067711889744, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.052805, "kv_vq_loss": 0.000501, "learning_rate": 0.0009717635945127391, "loss": 0.053299, "step": 7710, "value_mse_loss_layer_000": 0.000599, "value_mse_loss_layer_001": 0.001793, "value_mse_loss_layer_002": 0.007111, "value_mse_loss_layer_003": 0.010254, "value_mse_loss_layer_004": 0.009399, "value_mse_loss_layer_005": 0.009094, "value_mse_loss_layer_006": 0.011597, "value_mse_loss_layer_007": 0.011963, "value_mse_loss_layer_008": 0.014771, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.01709, "value_mse_loss_layer_012": 0.017334, "value_mse_loss_layer_013": 0.019409, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.023315, "value_mse_loss_layer_016": 0.018799, "value_mse_loss_layer_017": 0.023071, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.023438, "value_mse_loss_layer_020": 0.025391, "value_mse_loss_layer_021": 0.029419, "value_mse_loss_layer_022": 0.028687, "value_mse_loss_layer_023": 0.033936, "value_mse_loss_layer_024": 0.040039, "value_mse_loss_layer_025": 0.043213, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.049316, "value_mse_loss_layer_028": 0.057373, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.064453, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000143, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.000234, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000443, "vq_loss_layer_015": 0.000431, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000349, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000364, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000301, "vq_loss_layer_024": 0.00032, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.001129, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.003662 }, { "ce_loss": 2.290398, "epoch": 0.00772, "grad_norm": 0.0028103026561439037, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.120117, "key_mse_loss_layer_015": 0.10791, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.053308, "kv_vq_loss": 0.00051, "learning_rate": 0.0009719043250839339, "loss": 0.053821, "step": 7720, "value_mse_loss_layer_000": 0.00061, "value_mse_loss_layer_001": 0.001762, "value_mse_loss_layer_002": 0.006714, "value_mse_loss_layer_003": 0.010315, "value_mse_loss_layer_004": 0.009888, "value_mse_loss_layer_005": 0.010193, "value_mse_loss_layer_006": 0.011292, "value_mse_loss_layer_007": 0.012451, "value_mse_loss_layer_008": 0.015015, "value_mse_loss_layer_009": 0.019775, "value_mse_loss_layer_010": 0.019775, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.019043, "value_mse_loss_layer_013": 0.019775, "value_mse_loss_layer_014": 0.020386, "value_mse_loss_layer_015": 0.023071, "value_mse_loss_layer_016": 0.019043, "value_mse_loss_layer_017": 0.022339, "value_mse_loss_layer_018": 0.022949, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.023438, "value_mse_loss_layer_021": 0.032471, "value_mse_loss_layer_022": 0.027832, "value_mse_loss_layer_023": 0.030884, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.044678, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.065918, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 9.8e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000193, "vq_loss_layer_008": 0.000211, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000301, "vq_loss_layer_011": 0.000244, "vq_loss_layer_012": 0.000475, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.000433, "vq_loss_layer_015": 0.000504, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000368, "vq_loss_layer_018": 0.000294, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000221, "vq_loss_layer_021": 0.000549, "vq_loss_layer_022": 0.000305, "vq_loss_layer_023": 0.000383, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.000359, "vq_loss_layer_026": 0.000839, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000847, "vq_loss_layer_029": 0.000923, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.324277, "epoch": 0.00773, "grad_norm": 0.0027674459852278233, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.052237, "kv_vq_loss": 0.000474, "learning_rate": 0.0009720448734795811, "loss": 0.052728, "step": 7730, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.001755, "value_mse_loss_layer_002": 0.006378, "value_mse_loss_layer_003": 0.010498, "value_mse_loss_layer_004": 0.009644, "value_mse_loss_layer_005": 0.009521, "value_mse_loss_layer_006": 0.01123, "value_mse_loss_layer_007": 0.012085, "value_mse_loss_layer_008": 0.014709, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.016602, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.018921, "value_mse_loss_layer_014": 0.019531, "value_mse_loss_layer_015": 0.022461, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.022217, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.022827, "value_mse_loss_layer_020": 0.025879, "value_mse_loss_layer_021": 0.028564, "value_mse_loss_layer_022": 0.02832, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.043701, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.05249, "value_mse_loss_layer_029": 0.073242, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000191, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000236, "vq_loss_layer_010": 0.00025, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000404, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000454, "vq_loss_layer_016": 0.000395, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000387, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000343, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000912, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.004486 }, { "ce_loss": 2.28539, "epoch": 0.00774, "grad_norm": 0.0021120496094226837, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.061035, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.052396, "kv_vq_loss": 0.000482, "learning_rate": 0.0009721852401707229, "loss": 0.05289, "step": 7740, "value_mse_loss_layer_000": 0.000599, "value_mse_loss_layer_001": 0.001793, "value_mse_loss_layer_002": 0.006287, "value_mse_loss_layer_003": 0.010254, "value_mse_loss_layer_004": 0.009644, "value_mse_loss_layer_005": 0.008972, "value_mse_loss_layer_006": 0.011169, "value_mse_loss_layer_007": 0.012024, "value_mse_loss_layer_008": 0.015259, "value_mse_loss_layer_009": 0.018799, "value_mse_loss_layer_010": 0.015625, "value_mse_loss_layer_011": 0.016846, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.02063, "value_mse_loss_layer_015": 0.022949, "value_mse_loss_layer_016": 0.019287, "value_mse_loss_layer_017": 0.022827, "value_mse_loss_layer_018": 0.019531, "value_mse_loss_layer_019": 0.023315, "value_mse_loss_layer_020": 0.024414, "value_mse_loss_layer_021": 0.028809, "value_mse_loss_layer_022": 0.029297, "value_mse_loss_layer_023": 0.033936, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.04248, "value_mse_loss_layer_026": 0.036377, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.051514, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.00019, "vq_loss_layer_008": 0.000201, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000225, "vq_loss_layer_012": 0.00036, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.00045, "vq_loss_layer_015": 0.00045, "vq_loss_layer_016": 0.000412, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000227, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000904, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.003357 }, { "ce_loss": 2.355074, "epoch": 0.00775, "grad_norm": 0.0024214552249759436, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.059326, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.052148, "kv_vq_loss": 0.000476, "learning_rate": 0.0009723254256265775, "loss": 0.052643, "step": 7750, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.00177, "value_mse_loss_layer_002": 0.00647, "value_mse_loss_layer_003": 0.011475, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.008972, "value_mse_loss_layer_006": 0.011475, "value_mse_loss_layer_007": 0.011597, "value_mse_loss_layer_008": 0.014587, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.015625, "value_mse_loss_layer_011": 0.016724, "value_mse_loss_layer_012": 0.021973, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.019531, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.018066, "value_mse_loss_layer_017": 0.022339, "value_mse_loss_layer_018": 0.019531, "value_mse_loss_layer_019": 0.022949, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.027466, "value_mse_loss_layer_022": 0.028687, "value_mse_loss_layer_023": 0.032715, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.047852, "value_mse_loss_layer_028": 0.052246, "value_mse_loss_layer_029": 0.065918, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000142, "vq_loss_layer_007": 0.000179, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000239, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000221, "vq_loss_layer_012": 0.000668, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000372, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.00082, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.001762, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.359564, "epoch": 0.00776, "grad_norm": 0.0030413956847041845, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.052432, "kv_vq_loss": 0.000488, "learning_rate": 0.000972465430314547, "loss": 0.05293, "step": 7760, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001762, "value_mse_loss_layer_002": 0.006317, "value_mse_loss_layer_003": 0.010376, "value_mse_loss_layer_004": 0.010193, "value_mse_loss_layer_005": 0.009338, "value_mse_loss_layer_006": 0.011414, "value_mse_loss_layer_007": 0.012512, "value_mse_loss_layer_008": 0.015259, "value_mse_loss_layer_009": 0.019775, "value_mse_loss_layer_010": 0.017944, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.018677, "value_mse_loss_layer_013": 0.019409, "value_mse_loss_layer_014": 0.02002, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.021973, "value_mse_loss_layer_018": 0.019653, "value_mse_loss_layer_019": 0.022583, "value_mse_loss_layer_020": 0.026245, "value_mse_loss_layer_021": 0.028076, "value_mse_loss_layer_022": 0.027954, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.080566, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.067383, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 9.3e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000135, "vq_loss_layer_007": 0.000208, "vq_loss_layer_008": 0.000223, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.00025, "vq_loss_layer_011": 0.000256, "vq_loss_layer_012": 0.000456, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000431, "vq_loss_layer_015": 0.000504, "vq_loss_layer_016": 0.000486, "vq_loss_layer_017": 0.00036, "vq_loss_layer_018": 0.000246, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.000243, "vq_loss_layer_021": 0.000412, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000313, "vq_loss_layer_024": 0.000301, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.001167, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.270005, "epoch": 0.00777, "grad_norm": 0.002158788265660405, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.08252, "kv_mse_loss": 0.052597, "kv_vq_loss": 0.000485, "learning_rate": 0.0009726052547002284, "loss": 0.053094, "step": 7770, "value_mse_loss_layer_000": 0.000584, "value_mse_loss_layer_001": 0.001785, "value_mse_loss_layer_002": 0.006226, "value_mse_loss_layer_003": 0.010498, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.009094, "value_mse_loss_layer_006": 0.010986, "value_mse_loss_layer_007": 0.012085, "value_mse_loss_layer_008": 0.014465, "value_mse_loss_layer_009": 0.019043, "value_mse_loss_layer_010": 0.016479, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.017944, "value_mse_loss_layer_013": 0.019897, "value_mse_loss_layer_014": 0.021118, "value_mse_loss_layer_015": 0.023438, "value_mse_loss_layer_016": 0.019043, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.019653, "value_mse_loss_layer_019": 0.022827, "value_mse_loss_layer_020": 0.024536, "value_mse_loss_layer_021": 0.028564, "value_mse_loss_layer_022": 0.030273, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.071289, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000195, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.00021, "vq_loss_layer_011": 0.000244, "vq_loss_layer_012": 0.000402, "vq_loss_layer_013": 0.000519, "vq_loss_layer_014": 0.000435, "vq_loss_layer_015": 0.00046, "vq_loss_layer_016": 0.000397, "vq_loss_layer_017": 0.000383, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000218, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000376, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.00074, "vq_loss_layer_028": 0.001152, "vq_loss_layer_029": 0.002106, "vq_loss_layer_030": 0.002701, "vq_loss_layer_031": 0.004913 }, { "ce_loss": 2.29411, "epoch": 0.00778, "grad_norm": 0.001956962514668703, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.011169, "key_mse_loss_layer_002": 0.062988, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.095215, "key_mse_loss_layer_009": 0.103516, "key_mse_loss_layer_010": 0.116211, "key_mse_loss_layer_011": 0.11084, "key_mse_loss_layer_012": 0.085938, "key_mse_loss_layer_013": 0.150391, "key_mse_loss_layer_014": 0.147461, "key_mse_loss_layer_015": 0.134766, "key_mse_loss_layer_016": 0.128906, "key_mse_loss_layer_017": 0.125, "key_mse_loss_layer_018": 0.133789, "key_mse_loss_layer_019": 0.105957, "key_mse_loss_layer_020": 0.122559, "key_mse_loss_layer_021": 0.116699, "key_mse_loss_layer_022": 0.12207, "key_mse_loss_layer_023": 0.117676, "key_mse_loss_layer_024": 0.09375, "key_mse_loss_layer_025": 0.087402, "key_mse_loss_layer_026": 0.104004, "key_mse_loss_layer_027": 0.102051, "key_mse_loss_layer_028": 0.106445, "key_mse_loss_layer_029": 0.095215, "key_mse_loss_layer_030": 0.104004, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.052277, "kv_vq_loss": 0.000487, "learning_rate": 0.0009727448992474221, "loss": 0.052786, "step": 7780, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001755, "value_mse_loss_layer_002": 0.006744, "value_mse_loss_layer_003": 0.011108, "value_mse_loss_layer_004": 0.010376, "value_mse_loss_layer_005": 0.009949, "value_mse_loss_layer_006": 0.011719, "value_mse_loss_layer_007": 0.012512, "value_mse_loss_layer_008": 0.014648, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.016357, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.017944, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.021851, "value_mse_loss_layer_020": 0.022705, "value_mse_loss_layer_021": 0.026367, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.061523, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.06543, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 2.5e-05, "vq_loss_layer_003": 4.4e-05, "vq_loss_layer_004": 8.5e-05, "vq_loss_layer_005": 0.0001, "vq_loss_layer_006": 0.000168, "vq_loss_layer_007": 0.000205, "vq_loss_layer_008": 0.000263, "vq_loss_layer_009": 0.000334, "vq_loss_layer_010": 0.000311, "vq_loss_layer_011": 0.000299, "vq_loss_layer_012": 0.00045, "vq_loss_layer_013": 0.000387, "vq_loss_layer_014": 0.000549, "vq_loss_layer_015": 0.000519, "vq_loss_layer_016": 0.000511, "vq_loss_layer_017": 0.000376, "vq_loss_layer_018": 0.000238, "vq_loss_layer_019": 0.000263, "vq_loss_layer_020": 0.000322, "vq_loss_layer_021": 0.000553, "vq_loss_layer_022": 0.000431, "vq_loss_layer_023": 0.000385, "vq_loss_layer_024": 0.000526, "vq_loss_layer_025": 0.000717, "vq_loss_layer_026": 0.000679, "vq_loss_layer_027": 0.000866, "vq_loss_layer_028": 0.001404, "vq_loss_layer_029": 0.001411, "vq_loss_layer_030": 0.004364, "vq_loss_layer_031": 0.005768 }, { "ce_loss": 2.360568, "epoch": 0.00779, "grad_norm": 0.002674391260370612, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.052081, "kv_vq_loss": 0.000492, "learning_rate": 0.000972884364418141, "loss": 0.052591, "step": 7790, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001801, "value_mse_loss_layer_002": 0.006348, "value_mse_loss_layer_003": 0.010376, "value_mse_loss_layer_004": 0.009766, "value_mse_loss_layer_005": 0.009399, "value_mse_loss_layer_006": 0.011475, "value_mse_loss_layer_007": 0.012329, "value_mse_loss_layer_008": 0.015076, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.017334, "value_mse_loss_layer_011": 0.017578, "value_mse_loss_layer_012": 0.02124, "value_mse_loss_layer_013": 0.020264, "value_mse_loss_layer_014": 0.020996, "value_mse_loss_layer_015": 0.022705, "value_mse_loss_layer_016": 0.019043, "value_mse_loss_layer_017": 0.022461, "value_mse_loss_layer_018": 0.020874, "value_mse_loss_layer_019": 0.023071, "value_mse_loss_layer_020": 0.024048, "value_mse_loss_layer_021": 0.030518, "value_mse_loss_layer_022": 0.027344, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.044434, "value_mse_loss_layer_026": 0.039062, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.050781, "value_mse_loss_layer_029": 0.065918, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.0002, "vq_loss_layer_008": 0.000211, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000241, "vq_loss_layer_011": 0.000277, "vq_loss_layer_012": 0.000614, "vq_loss_layer_013": 0.000364, "vq_loss_layer_014": 0.000429, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000422, "vq_loss_layer_017": 0.000378, "vq_loss_layer_018": 0.000256, "vq_loss_layer_019": 0.000189, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000463, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.000303, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.00046, "vq_loss_layer_026": 0.000572, "vq_loss_layer_027": 0.000759, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.001945, "vq_loss_layer_031": 0.004303 }, { "ce_loss": 2.338089, "epoch": 0.0078, "grad_norm": 0.0022695749066770077, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.052115, "kv_vq_loss": 0.000483, "learning_rate": 0.0009730236506726199, "loss": 0.052612, "step": 7800, "value_mse_loss_layer_000": 0.000607, "value_mse_loss_layer_001": 0.001762, "value_mse_loss_layer_002": 0.006287, "value_mse_loss_layer_003": 0.011169, "value_mse_loss_layer_004": 0.009521, "value_mse_loss_layer_005": 0.009399, "value_mse_loss_layer_006": 0.01123, "value_mse_loss_layer_007": 0.012146, "value_mse_loss_layer_008": 0.014893, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.016235, "value_mse_loss_layer_011": 0.0177, "value_mse_loss_layer_012": 0.018311, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.020508, "value_mse_loss_layer_015": 0.022949, "value_mse_loss_layer_016": 0.019287, "value_mse_loss_layer_017": 0.022705, "value_mse_loss_layer_018": 0.02124, "value_mse_loss_layer_019": 0.023682, "value_mse_loss_layer_020": 0.025513, "value_mse_loss_layer_021": 0.029297, "value_mse_loss_layer_022": 0.029907, "value_mse_loss_layer_023": 0.032959, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.041992, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.055176, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.064941, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000185, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.00021, "vq_loss_layer_011": 0.00025, "vq_loss_layer_012": 0.000387, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.000395, "vq_loss_layer_017": 0.00036, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000225, "vq_loss_layer_021": 0.000368, "vq_loss_layer_022": 0.000298, "vq_loss_layer_023": 0.000298, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000334, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.00087, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.003876 }, { "ce_loss": 2.308413, "epoch": 0.00781, "grad_norm": 0.0022759067360311747, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.0625, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.05278, "kv_vq_loss": 0.000478, "learning_rate": 0.000973162758469325, "loss": 0.053281, "step": 7810, "value_mse_loss_layer_000": 0.000599, "value_mse_loss_layer_001": 0.001755, "value_mse_loss_layer_002": 0.006439, "value_mse_loss_layer_003": 0.010193, "value_mse_loss_layer_004": 0.009094, "value_mse_loss_layer_005": 0.008789, "value_mse_loss_layer_006": 0.010742, "value_mse_loss_layer_007": 0.011658, "value_mse_loss_layer_008": 0.014709, "value_mse_loss_layer_009": 0.019531, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.016479, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.02002, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.02063, "value_mse_loss_layer_017": 0.022949, "value_mse_loss_layer_018": 0.02063, "value_mse_loss_layer_019": 0.024536, "value_mse_loss_layer_020": 0.025635, "value_mse_loss_layer_021": 0.031128, "value_mse_loss_layer_022": 0.029907, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.045166, "value_mse_loss_layer_026": 0.039062, "value_mse_loss_layer_027": 0.04834, "value_mse_loss_layer_028": 0.052734, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.065918, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000189, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000225, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000385, "vq_loss_layer_015": 0.000467, "vq_loss_layer_016": 0.000406, "vq_loss_layer_017": 0.000362, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000221, "vq_loss_layer_021": 0.000347, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000209, "vq_loss_layer_024": 0.000224, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000441, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000942, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.003555 }, { "ce_loss": 2.302566, "epoch": 0.00782, "grad_norm": 0.0025524324737489223, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.052145, "kv_vq_loss": 0.000484, "learning_rate": 0.0009733016882649619, "loss": 0.05264, "step": 7820, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001801, "value_mse_loss_layer_002": 0.006409, "value_mse_loss_layer_003": 0.010071, "value_mse_loss_layer_004": 0.009766, "value_mse_loss_layer_005": 0.00885, "value_mse_loss_layer_006": 0.011292, "value_mse_loss_layer_007": 0.012329, "value_mse_loss_layer_008": 0.014343, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.016357, "value_mse_loss_layer_011": 0.016724, "value_mse_loss_layer_012": 0.018555, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.02063, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.019287, "value_mse_loss_layer_019": 0.022583, "value_mse_loss_layer_020": 0.025146, "value_mse_loss_layer_021": 0.027832, "value_mse_loss_layer_022": 0.029175, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000136, "vq_loss_layer_007": 0.000201, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.00021, "vq_loss_layer_011": 0.000225, "vq_loss_layer_012": 0.000446, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000441, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000404, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000313, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.001099, "vq_loss_layer_030": 0.002563, "vq_loss_layer_031": 0.003494 }, { "ce_loss": 2.328783, "epoch": 0.00783, "grad_norm": 0.0018233581213280559, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.051956, "kv_vq_loss": 0.00047, "learning_rate": 0.0009734404405144857, "loss": 0.052451, "step": 7830, "value_mse_loss_layer_000": 0.000599, "value_mse_loss_layer_001": 0.001793, "value_mse_loss_layer_002": 0.006561, "value_mse_loss_layer_003": 0.013367, "value_mse_loss_layer_004": 0.009644, "value_mse_loss_layer_005": 0.009705, "value_mse_loss_layer_006": 0.011047, "value_mse_loss_layer_007": 0.012207, "value_mse_loss_layer_008": 0.014893, "value_mse_loss_layer_009": 0.018799, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.016357, "value_mse_loss_layer_012": 0.017212, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.021362, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.021973, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.022705, "value_mse_loss_layer_020": 0.024658, "value_mse_loss_layer_021": 0.027588, "value_mse_loss_layer_022": 0.028442, "value_mse_loss_layer_023": 0.032227, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.04248, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.05127, "value_mse_loss_layer_029": 0.063477, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.0002, "vq_loss_layer_008": 0.000213, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000202, "vq_loss_layer_011": 0.00023, "vq_loss_layer_012": 0.000399, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000357, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000254, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000343, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000313, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000343, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.000973, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.003723 }, { "ce_loss": 2.275061, "epoch": 0.00784, "grad_norm": 0.0028858110308647156, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.124023, "key_mse_loss_layer_014": 0.120117, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.103516, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.100098, "key_mse_loss_layer_024": 0.080078, "key_mse_loss_layer_025": 0.077148, "key_mse_loss_layer_026": 0.088867, "key_mse_loss_layer_027": 0.089844, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.089355, "key_mse_loss_layer_030": 0.093262, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.052316, "kv_vq_loss": 0.00049, "learning_rate": 0.0009735790156711095, "loss": 0.052823, "step": 7840, "value_mse_loss_layer_000": 0.000576, "value_mse_loss_layer_001": 0.001747, "value_mse_loss_layer_002": 0.006866, "value_mse_loss_layer_003": 0.010925, "value_mse_loss_layer_004": 0.009766, "value_mse_loss_layer_005": 0.009216, "value_mse_loss_layer_006": 0.010986, "value_mse_loss_layer_007": 0.011658, "value_mse_loss_layer_008": 0.014343, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015198, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.023193, "value_mse_loss_layer_021": 0.0271, "value_mse_loss_layer_022": 0.0271, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.046631, "value_mse_loss_layer_028": 0.050781, "value_mse_loss_layer_029": 0.075684, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.070801, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000259, "vq_loss_layer_010": 0.000211, "vq_loss_layer_011": 0.000229, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000357, "vq_loss_layer_017": 0.00036, "vq_loss_layer_018": 0.000254, "vq_loss_layer_019": 0.000197, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000393, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.00041, "vq_loss_layer_026": 0.000538, "vq_loss_layer_027": 0.000664, "vq_loss_layer_028": 0.00106, "vq_loss_layer_029": 0.00145, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.00531 }, { "ce_loss": 2.298754, "epoch": 0.00785, "grad_norm": 0.002199376467615366, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.052551, "kv_vq_loss": 0.000476, "learning_rate": 0.000973717414186313, "loss": 0.053043, "step": 7850, "value_mse_loss_layer_000": 0.000607, "value_mse_loss_layer_001": 0.001778, "value_mse_loss_layer_002": 0.006561, "value_mse_loss_layer_003": 0.010254, "value_mse_loss_layer_004": 0.009338, "value_mse_loss_layer_005": 0.009155, "value_mse_loss_layer_006": 0.011108, "value_mse_loss_layer_007": 0.011902, "value_mse_loss_layer_008": 0.014526, "value_mse_loss_layer_009": 0.018799, "value_mse_loss_layer_010": 0.016357, "value_mse_loss_layer_011": 0.016479, "value_mse_loss_layer_012": 0.017456, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.022217, "value_mse_loss_layer_016": 0.018555, "value_mse_loss_layer_017": 0.022095, "value_mse_loss_layer_018": 0.019043, "value_mse_loss_layer_019": 0.022583, "value_mse_loss_layer_020": 0.024048, "value_mse_loss_layer_021": 0.027588, "value_mse_loss_layer_022": 0.028442, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000184, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.000404, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000446, "vq_loss_layer_016": 0.000389, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000209, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000359, "vq_loss_layer_022": 0.000271, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.003723 }, { "ce_loss": 2.256562, "epoch": 0.00786, "grad_norm": 0.0021063508465886116, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.052292, "kv_vq_loss": 0.000482, "learning_rate": 0.0009738556365098519, "loss": 0.052792, "step": 7860, "value_mse_loss_layer_000": 0.000565, "value_mse_loss_layer_001": 0.00174, "value_mse_loss_layer_002": 0.006287, "value_mse_loss_layer_003": 0.010132, "value_mse_loss_layer_004": 0.009094, "value_mse_loss_layer_005": 0.009338, "value_mse_loss_layer_006": 0.011292, "value_mse_loss_layer_007": 0.01178, "value_mse_loss_layer_008": 0.014404, "value_mse_loss_layer_009": 0.018921, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.016846, "value_mse_loss_layer_012": 0.018433, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.02063, "value_mse_loss_layer_015": 0.022705, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.022461, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.026489, "value_mse_loss_layer_020": 0.024292, "value_mse_loss_layer_021": 0.029175, "value_mse_loss_layer_022": 0.028564, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000179, "vq_loss_layer_008": 0.000182, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000234, "vq_loss_layer_012": 0.000439, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000429, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000383, "vq_loss_layer_018": 0.00022, "vq_loss_layer_019": 0.000235, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000366, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000232, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.000862, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.003387 }, { "ce_loss": 2.296982, "epoch": 0.00787, "grad_norm": 0.0031807878986001015, "key_mse_loss_layer_000": 0.004486, "key_mse_loss_layer_001": 0.013, "key_mse_loss_layer_002": 0.069336, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.082031, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.092285, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.101074, "key_mse_loss_layer_014": 0.100586, "key_mse_loss_layer_015": 0.092285, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.100586, "key_mse_loss_layer_023": 0.103027, "key_mse_loss_layer_024": 0.085938, "key_mse_loss_layer_025": 0.083984, "key_mse_loss_layer_026": 0.095215, "key_mse_loss_layer_027": 0.103027, "key_mse_loss_layer_028": 0.106445, "key_mse_loss_layer_029": 0.100586, "key_mse_loss_layer_030": 0.098633, "key_mse_loss_layer_031": 0.085938, "kv_mse_loss": 0.052237, "kv_vq_loss": 0.000476, "learning_rate": 0.0009739936830897659, "loss": 0.052737, "step": 7870, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001831, "value_mse_loss_layer_002": 0.007568, "value_mse_loss_layer_003": 0.012573, "value_mse_loss_layer_004": 0.01062, "value_mse_loss_layer_005": 0.00946, "value_mse_loss_layer_006": 0.010498, "value_mse_loss_layer_007": 0.011902, "value_mse_loss_layer_008": 0.014038, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.014648, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.018555, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.018433, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.020874, "value_mse_loss_layer_019": 0.024292, "value_mse_loss_layer_020": 0.026489, "value_mse_loss_layer_021": 0.032959, "value_mse_loss_layer_022": 0.03125, "value_mse_loss_layer_023": 0.036621, "value_mse_loss_layer_024": 0.039551, "value_mse_loss_layer_025": 0.049805, "value_mse_loss_layer_026": 0.057617, "value_mse_loss_layer_027": 0.059082, "value_mse_loss_layer_028": 0.071777, "value_mse_loss_layer_029": 0.086426, "value_mse_loss_layer_030": 0.085938, "value_mse_loss_layer_031": 0.08252, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2.3e-05, "vq_loss_layer_002": 2.5e-05, "vq_loss_layer_003": 6.2e-05, "vq_loss_layer_004": 8.4e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000206, "vq_loss_layer_009": 0.000175, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000232, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000242, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.000153, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000511, "vq_loss_layer_026": 0.00119, "vq_loss_layer_027": 0.000713, "vq_loss_layer_028": 0.001694, "vq_loss_layer_029": 0.001999, "vq_loss_layer_030": 0.003052, "vq_loss_layer_031": 0.006836 }, { "ce_loss": 2.296342, "epoch": 0.00788, "grad_norm": 0.002194394590333104, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.045166, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.112793, "key_mse_loss_layer_016": 0.10498, "key_mse_loss_layer_017": 0.106445, "key_mse_loss_layer_018": 0.110352, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.102539, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.100586, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.062012, "kv_mse_loss": 0.052664, "kv_vq_loss": 0.000489, "learning_rate": 0.0009741315543723887, "loss": 0.053171, "step": 7880, "value_mse_loss_layer_000": 0.00058, "value_mse_loss_layer_001": 0.001701, "value_mse_loss_layer_002": 0.006378, "value_mse_loss_layer_003": 0.010681, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.009888, "value_mse_loss_layer_006": 0.010803, "value_mse_loss_layer_007": 0.011902, "value_mse_loss_layer_008": 0.013977, "value_mse_loss_layer_009": 0.019897, "value_mse_loss_layer_010": 0.015625, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.017944, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.023071, "value_mse_loss_layer_021": 0.025269, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.057373, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 9.4e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000208, "vq_loss_layer_009": 0.000311, "vq_loss_layer_010": 0.000236, "vq_loss_layer_011": 0.000259, "vq_loss_layer_012": 0.000378, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000462, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.00041, "vq_loss_layer_017": 0.00037, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000243, "vq_loss_layer_021": 0.000402, "vq_loss_layer_022": 0.000347, "vq_loss_layer_023": 0.00034, "vq_loss_layer_024": 0.000391, "vq_loss_layer_025": 0.00046, "vq_loss_layer_026": 0.000603, "vq_loss_layer_027": 0.000702, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.002838, "vq_loss_layer_031": 0.005157 }, { "ce_loss": 2.311526, "epoch": 0.00789, "grad_norm": 0.0021846576128154993, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.052057, "kv_vq_loss": 0.000479, "learning_rate": 0.000974269250802355, "loss": 0.052554, "step": 7890, "value_mse_loss_layer_000": 0.000584, "value_mse_loss_layer_001": 0.001778, "value_mse_loss_layer_002": 0.006256, "value_mse_loss_layer_003": 0.009949, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.009033, "value_mse_loss_layer_006": 0.010925, "value_mse_loss_layer_007": 0.011963, "value_mse_loss_layer_008": 0.014465, "value_mse_loss_layer_009": 0.019043, "value_mse_loss_layer_010": 0.015381, "value_mse_loss_layer_011": 0.016357, "value_mse_loss_layer_012": 0.019531, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.020264, "value_mse_loss_layer_015": 0.023315, "value_mse_loss_layer_016": 0.019653, "value_mse_loss_layer_017": 0.022461, "value_mse_loss_layer_018": 0.021362, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.031128, "value_mse_loss_layer_022": 0.02832, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.062256, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000188, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000242, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000519, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000244, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000391, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000236, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000452, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.29991, "epoch": 0.0079, "grad_norm": 0.0023687819484621286, "key_mse_loss_layer_000": 0.003647, "key_mse_loss_layer_001": 0.011353, "key_mse_loss_layer_002": 0.059814, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.065918, "key_mse_loss_layer_006": 0.073242, "key_mse_loss_layer_007": 0.081055, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.052435, "kv_vq_loss": 0.000492, "learning_rate": 0.0009744067728226102, "loss": 0.052939, "step": 7900, "value_mse_loss_layer_000": 0.000587, "value_mse_loss_layer_001": 0.001793, "value_mse_loss_layer_002": 0.0065, "value_mse_loss_layer_003": 0.010559, "value_mse_loss_layer_004": 0.01001, "value_mse_loss_layer_005": 0.009705, "value_mse_loss_layer_006": 0.011292, "value_mse_loss_layer_007": 0.012085, "value_mse_loss_layer_008": 0.014832, "value_mse_loss_layer_009": 0.018921, "value_mse_loss_layer_010": 0.016479, "value_mse_loss_layer_011": 0.017578, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.018188, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.019653, "value_mse_loss_layer_019": 0.023682, "value_mse_loss_layer_020": 0.024292, "value_mse_loss_layer_021": 0.029419, "value_mse_loss_layer_022": 0.028809, "value_mse_loss_layer_023": 0.035156, "value_mse_loss_layer_024": 0.036377, "value_mse_loss_layer_025": 0.042725, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.051025, "value_mse_loss_layer_028": 0.052246, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.070801, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.000133, "vq_loss_layer_007": 0.000199, "vq_loss_layer_008": 0.000198, "vq_loss_layer_009": 0.000242, "vq_loss_layer_010": 0.000229, "vq_loss_layer_011": 0.000298, "vq_loss_layer_012": 0.000412, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000412, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000397, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000215, "vq_loss_layer_019": 0.000191, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.000439, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000362, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000763, "vq_loss_layer_028": 0.000847, "vq_loss_layer_029": 0.001114, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.004211 }, { "ce_loss": 2.323388, "epoch": 0.00791, "grad_norm": 0.002455032430589199, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.094727, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.052161, "kv_vq_loss": 0.000478, "learning_rate": 0.000974544120874419, "loss": 0.052658, "step": 7910, "value_mse_loss_layer_000": 0.000576, "value_mse_loss_layer_001": 0.001732, "value_mse_loss_layer_002": 0.006744, "value_mse_loss_layer_003": 0.010437, "value_mse_loss_layer_004": 0.009888, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.010925, "value_mse_loss_layer_007": 0.011597, "value_mse_loss_layer_008": 0.014526, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015564, "value_mse_loss_layer_011": 0.015991, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.017944, "value_mse_loss_layer_017": 0.022339, "value_mse_loss_layer_018": 0.019409, "value_mse_loss_layer_019": 0.023926, "value_mse_loss_layer_020": 0.024292, "value_mse_loss_layer_021": 0.029053, "value_mse_loss_layer_022": 0.02832, "value_mse_loss_layer_023": 0.034668, "value_mse_loss_layer_024": 0.038574, "value_mse_loss_layer_025": 0.045166, "value_mse_loss_layer_026": 0.043701, "value_mse_loss_layer_027": 0.050293, "value_mse_loss_layer_028": 0.055176, "value_mse_loss_layer_029": 0.073242, "value_mse_loss_layer_030": 0.073242, "value_mse_loss_layer_031": 0.067383, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000225, "vq_loss_layer_011": 0.000225, "vq_loss_layer_012": 0.000364, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000393, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000205, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000656, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.001022, "vq_loss_layer_029": 0.001236, "vq_loss_layer_030": 0.002655, "vq_loss_layer_031": 0.00473 }, { "ce_loss": 2.353737, "epoch": 0.00792, "grad_norm": 0.00201026676222682, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.052133, "kv_vq_loss": 0.000475, "learning_rate": 0.0009746812953973733, "loss": 0.052634, "step": 7920, "value_mse_loss_layer_000": 0.000572, "value_mse_loss_layer_001": 0.00174, "value_mse_loss_layer_002": 0.006439, "value_mse_loss_layer_003": 0.011719, "value_mse_loss_layer_004": 0.009155, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.011047, "value_mse_loss_layer_007": 0.01178, "value_mse_loss_layer_008": 0.014709, "value_mse_loss_layer_009": 0.019043, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.016724, "value_mse_loss_layer_012": 0.017334, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.020752, "value_mse_loss_layer_015": 0.022705, "value_mse_loss_layer_016": 0.020996, "value_mse_loss_layer_017": 0.022217, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.022827, "value_mse_loss_layer_020": 0.024048, "value_mse_loss_layer_021": 0.02832, "value_mse_loss_layer_022": 0.028076, "value_mse_loss_layer_023": 0.031738, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.042969, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.064941, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.0625, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000177, "vq_loss_layer_008": 0.000189, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000221, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000328, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000437, "vq_loss_layer_017": 0.000341, "vq_loss_layer_018": 0.000234, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.00082, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.001884, "vq_loss_layer_031": 0.003891 }, { "ce_loss": 2.33439, "epoch": 0.00793, "grad_norm": 0.0020484745036810637, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.052072, "kv_vq_loss": 0.000474, "learning_rate": 0.0009748182968294008, "loss": 0.052563, "step": 7930, "value_mse_loss_layer_000": 0.000599, "value_mse_loss_layer_001": 0.001732, "value_mse_loss_layer_002": 0.006287, "value_mse_loss_layer_003": 0.010559, "value_mse_loss_layer_004": 0.010559, "value_mse_loss_layer_005": 0.009399, "value_mse_loss_layer_006": 0.012329, "value_mse_loss_layer_007": 0.012573, "value_mse_loss_layer_008": 0.014526, "value_mse_loss_layer_009": 0.019531, "value_mse_loss_layer_010": 0.015625, "value_mse_loss_layer_011": 0.016846, "value_mse_loss_layer_012": 0.017456, "value_mse_loss_layer_013": 0.018921, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.018799, "value_mse_loss_layer_017": 0.022949, "value_mse_loss_layer_018": 0.020386, "value_mse_loss_layer_019": 0.023193, "value_mse_loss_layer_020": 0.024658, "value_mse_loss_layer_021": 0.028442, "value_mse_loss_layer_022": 0.027954, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.06543, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 9.5e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000187, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.000187, "vq_loss_layer_009": 0.000259, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000378, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000414, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000381, "vq_loss_layer_017": 0.00037, "vq_loss_layer_018": 0.000213, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000366, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000469, "vq_loss_layer_027": 0.000484, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.000866, "vq_loss_layer_030": 0.001694, "vq_loss_layer_031": 0.003784 }, { "ce_loss": 2.288379, "epoch": 0.00794, "grad_norm": 0.0029238564893603325, "key_mse_loss_layer_000": 0.004425, "key_mse_loss_layer_001": 0.011719, "key_mse_loss_layer_002": 0.061523, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.106445, "key_mse_loss_layer_014": 0.104004, "key_mse_loss_layer_015": 0.092773, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.089355, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.052417, "kv_vq_loss": 0.000497, "learning_rate": 0.0009749551256067739, "loss": 0.052917, "step": 7940, "value_mse_loss_layer_000": 0.000603, "value_mse_loss_layer_001": 0.001823, "value_mse_loss_layer_002": 0.006744, "value_mse_loss_layer_003": 0.010681, "value_mse_loss_layer_004": 0.010132, "value_mse_loss_layer_005": 0.009888, "value_mse_loss_layer_006": 0.011047, "value_mse_loss_layer_007": 0.011719, "value_mse_loss_layer_008": 0.014893, "value_mse_loss_layer_009": 0.018311, "value_mse_loss_layer_010": 0.015198, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.018188, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.020752, "value_mse_loss_layer_019": 0.023682, "value_mse_loss_layer_020": 0.02478, "value_mse_loss_layer_021": 0.028198, "value_mse_loss_layer_022": 0.029541, "value_mse_loss_layer_023": 0.033691, "value_mse_loss_layer_024": 0.037354, "value_mse_loss_layer_025": 0.044678, "value_mse_loss_layer_026": 0.04248, "value_mse_loss_layer_027": 0.053223, "value_mse_loss_layer_028": 0.059326, "value_mse_loss_layer_029": 0.09082, "value_mse_loss_layer_030": 0.081055, "value_mse_loss_layer_031": 0.073242, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.00024, "vq_loss_layer_009": 0.000238, "vq_loss_layer_010": 0.000228, "vq_loss_layer_011": 0.00022, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000328, "vq_loss_layer_014": 0.000393, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000433, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000305, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000292, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000343, "vq_loss_layer_025": 0.00042, "vq_loss_layer_026": 0.000599, "vq_loss_layer_027": 0.000675, "vq_loss_layer_028": 0.001175, "vq_loss_layer_029": 0.002167, "vq_loss_layer_030": 0.002747, "vq_loss_layer_031": 0.005554 }, { "ce_loss": 2.321549, "epoch": 0.00795, "grad_norm": 0.001973719336092472, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.108398, "key_mse_loss_layer_011": 0.106934, "key_mse_loss_layer_012": 0.079102, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.104004, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.102539, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.052258, "kv_vq_loss": 0.000473, "learning_rate": 0.0009750917821641175, "loss": 0.052756, "step": 7950, "value_mse_loss_layer_000": 0.000538, "value_mse_loss_layer_001": 0.001694, "value_mse_loss_layer_002": 0.006409, "value_mse_loss_layer_003": 0.010376, "value_mse_loss_layer_004": 0.009644, "value_mse_loss_layer_005": 0.009583, "value_mse_loss_layer_006": 0.01123, "value_mse_loss_layer_007": 0.012451, "value_mse_loss_layer_008": 0.014832, "value_mse_loss_layer_009": 0.019531, "value_mse_loss_layer_010": 0.016602, "value_mse_loss_layer_011": 0.017456, "value_mse_loss_layer_012": 0.018433, "value_mse_loss_layer_013": 0.02002, "value_mse_loss_layer_014": 0.020996, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.018188, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.019653, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.025391, "value_mse_loss_layer_021": 0.027954, "value_mse_loss_layer_022": 0.027466, "value_mse_loss_layer_023": 0.030396, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.070312, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.0625, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000208, "vq_loss_layer_008": 0.000217, "vq_loss_layer_009": 0.000261, "vq_loss_layer_010": 0.000265, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.00042, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.00046, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.000412, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000259, "vq_loss_layer_019": 0.000211, "vq_loss_layer_020": 0.000296, "vq_loss_layer_021": 0.000456, "vq_loss_layer_022": 0.00034, "vq_loss_layer_023": 0.00041, "vq_loss_layer_024": 0.000383, "vq_loss_layer_025": 0.000755, "vq_loss_layer_026": 0.000778, "vq_loss_layer_027": 0.00082, "vq_loss_layer_028": 0.001404, "vq_loss_layer_029": 0.001984, "vq_loss_layer_030": 0.003601, "vq_loss_layer_031": 0.004486 }, { "ce_loss": 2.273169, "epoch": 0.00796, "grad_norm": 0.002020174404606223, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.052426, "kv_vq_loss": 0.000484, "learning_rate": 0.000975228266934417, "loss": 0.052921, "step": 7960, "value_mse_loss_layer_000": 0.000576, "value_mse_loss_layer_001": 0.00177, "value_mse_loss_layer_002": 0.006195, "value_mse_loss_layer_003": 0.010925, "value_mse_loss_layer_004": 0.009338, "value_mse_loss_layer_005": 0.010315, "value_mse_loss_layer_006": 0.010986, "value_mse_loss_layer_007": 0.011841, "value_mse_loss_layer_008": 0.014832, "value_mse_loss_layer_009": 0.018799, "value_mse_loss_layer_010": 0.015625, "value_mse_loss_layer_011": 0.016357, "value_mse_loss_layer_012": 0.017212, "value_mse_loss_layer_013": 0.018921, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.021851, "value_mse_loss_layer_016": 0.018066, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.019531, "value_mse_loss_layer_019": 0.022827, "value_mse_loss_layer_020": 0.024414, "value_mse_loss_layer_021": 0.029907, "value_mse_loss_layer_022": 0.028931, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.041016, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 9.6e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000191, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000385, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000385, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000389, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.000553, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.001221, "vq_loss_layer_029": 0.001404, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.330052, "epoch": 0.00797, "grad_norm": 0.0022649962920695543, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.051929, "kv_vq_loss": 0.000483, "learning_rate": 0.0009753645803490279, "loss": 0.052429, "step": 7970, "value_mse_loss_layer_000": 0.000572, "value_mse_loss_layer_001": 0.001732, "value_mse_loss_layer_002": 0.006805, "value_mse_loss_layer_003": 0.010681, "value_mse_loss_layer_004": 0.009583, "value_mse_loss_layer_005": 0.009583, "value_mse_loss_layer_006": 0.011047, "value_mse_loss_layer_007": 0.011841, "value_mse_loss_layer_008": 0.014282, "value_mse_loss_layer_009": 0.018921, "value_mse_loss_layer_010": 0.015991, "value_mse_loss_layer_011": 0.016968, "value_mse_loss_layer_012": 0.017822, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.018433, "value_mse_loss_layer_017": 0.021973, "value_mse_loss_layer_018": 0.019165, "value_mse_loss_layer_019": 0.024658, "value_mse_loss_layer_020": 0.024048, "value_mse_loss_layer_021": 0.0271, "value_mse_loss_layer_022": 0.0271, "value_mse_loss_layer_023": 0.030151, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.039551, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.047607, "value_mse_loss_layer_028": 0.051514, "value_mse_loss_layer_029": 0.062012, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000185, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000208, "vq_loss_layer_011": 0.000242, "vq_loss_layer_012": 0.000401, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000385, "vq_loss_layer_015": 0.000425, "vq_loss_layer_016": 0.000387, "vq_loss_layer_017": 0.000435, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000198, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.00034, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000637, "vq_loss_layer_028": 0.000938, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.003937 }, { "ce_loss": 2.332219, "epoch": 0.00798, "grad_norm": 0.0022211051546037197, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.10791, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.051871, "kv_vq_loss": 0.000471, "learning_rate": 0.0009755007228376823, "loss": 0.052365, "step": 7980, "value_mse_loss_layer_000": 0.000565, "value_mse_loss_layer_001": 0.00174, "value_mse_loss_layer_002": 0.006134, "value_mse_loss_layer_003": 0.009766, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.009094, "value_mse_loss_layer_006": 0.011353, "value_mse_loss_layer_007": 0.012634, "value_mse_loss_layer_008": 0.015137, "value_mse_loss_layer_009": 0.019409, "value_mse_loss_layer_010": 0.015991, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.018799, "value_mse_loss_layer_013": 0.020142, "value_mse_loss_layer_014": 0.021851, "value_mse_loss_layer_015": 0.022949, "value_mse_loss_layer_016": 0.019043, "value_mse_loss_layer_017": 0.022095, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.023315, "value_mse_loss_layer_020": 0.024414, "value_mse_loss_layer_021": 0.028687, "value_mse_loss_layer_022": 0.028564, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.035645, "value_mse_loss_layer_025": 0.041016, "value_mse_loss_layer_026": 0.036377, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.050781, "value_mse_loss_layer_029": 0.063477, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.00022, "vq_loss_layer_008": 0.000217, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.00021, "vq_loss_layer_011": 0.000233, "vq_loss_layer_012": 0.000439, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000483, "vq_loss_layer_015": 0.000471, "vq_loss_layer_016": 0.000427, "vq_loss_layer_017": 0.000385, "vq_loss_layer_018": 0.000299, "vq_loss_layer_019": 0.0002, "vq_loss_layer_020": 0.000236, "vq_loss_layer_021": 0.000387, "vq_loss_layer_022": 0.000303, "vq_loss_layer_023": 0.000431, "vq_loss_layer_024": 0.000347, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000603, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.003723 }, { "ce_loss": 2.290983, "epoch": 0.00799, "grad_norm": 0.002971507143229246, "key_mse_loss_layer_000": 0.00351, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.052197, "kv_vq_loss": 0.0005, "learning_rate": 0.0009756366948284976, "loss": 0.052698, "step": 7990, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001778, "value_mse_loss_layer_002": 0.006287, "value_mse_loss_layer_003": 0.010986, "value_mse_loss_layer_004": 0.010132, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.011169, "value_mse_loss_layer_007": 0.011902, "value_mse_loss_layer_008": 0.014587, "value_mse_loss_layer_009": 0.019043, "value_mse_loss_layer_010": 0.016479, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.019531, "value_mse_loss_layer_015": 0.021851, "value_mse_loss_layer_016": 0.019043, "value_mse_loss_layer_017": 0.023438, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.022583, "value_mse_loss_layer_020": 0.024048, "value_mse_loss_layer_021": 0.029053, "value_mse_loss_layer_022": 0.02832, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.044434, "value_mse_loss_layer_026": 0.042236, "value_mse_loss_layer_027": 0.05249, "value_mse_loss_layer_028": 0.052246, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.072266, "value_mse_loss_layer_031": 0.067871, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000192, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000233, "vq_loss_layer_011": 0.000278, "vq_loss_layer_012": 0.00036, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000425, "vq_loss_layer_016": 0.000416, "vq_loss_layer_017": 0.000416, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000383, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000626, "vq_loss_layer_027": 0.000683, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.003662, "vq_loss_layer_031": 0.00415 }, { "ce_loss": 2.331132, "epoch": 0.008, "grad_norm": 0.0017181899165734649, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.051926, "kv_vq_loss": 0.000474, "learning_rate": 0.0009757724967479857, "loss": 0.05242, "step": 8000, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001732, "value_mse_loss_layer_002": 0.006104, "value_mse_loss_layer_003": 0.01001, "value_mse_loss_layer_004": 0.009155, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.01123, "value_mse_loss_layer_007": 0.011719, "value_mse_loss_layer_008": 0.014832, "value_mse_loss_layer_009": 0.019409, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.016479, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.020386, "value_mse_loss_layer_015": 0.022827, "value_mse_loss_layer_016": 0.018188, "value_mse_loss_layer_017": 0.022705, "value_mse_loss_layer_018": 0.018799, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.023926, "value_mse_loss_layer_021": 0.028564, "value_mse_loss_layer_022": 0.02771, "value_mse_loss_layer_023": 0.030151, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.053223, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.061523, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000192, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.000225, "vq_loss_layer_012": 0.000406, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000422, "vq_loss_layer_015": 0.000444, "vq_loss_layer_016": 0.000406, "vq_loss_layer_017": 0.000355, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.00041, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.003418 }, { "ce_loss": 2.30541, "epoch": 0.00801, "grad_norm": 0.002797102089971304, "key_mse_loss_layer_000": 0.00351, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.123535, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.102539, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.103027, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.088867, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.052631, "kv_vq_loss": 0.000483, "learning_rate": 0.0009759081290210593, "loss": 0.053125, "step": 8010, "value_mse_loss_layer_000": 0.00061, "value_mse_loss_layer_001": 0.001785, "value_mse_loss_layer_002": 0.006439, "value_mse_loss_layer_003": 0.01062, "value_mse_loss_layer_004": 0.009583, "value_mse_loss_layer_005": 0.009216, "value_mse_loss_layer_006": 0.011169, "value_mse_loss_layer_007": 0.012085, "value_mse_loss_layer_008": 0.014282, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015381, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.017456, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.019531, "value_mse_loss_layer_015": 0.021606, "value_mse_loss_layer_016": 0.020264, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.019287, "value_mse_loss_layer_019": 0.024902, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.028076, "value_mse_loss_layer_022": 0.029175, "value_mse_loss_layer_023": 0.036133, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.05127, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.049072, "value_mse_loss_layer_028": 0.052979, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000399, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000427, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000448, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000208, "vq_loss_layer_020": 0.000229, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000364, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.000463, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.000851, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.004517 }, { "ce_loss": 2.299331, "epoch": 0.00802, "grad_norm": 0.002366461791098118, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.052399, "kv_vq_loss": 0.000483, "learning_rate": 0.0009760435920710406, "loss": 0.052893, "step": 8020, "value_mse_loss_layer_000": 0.00058, "value_mse_loss_layer_001": 0.001778, "value_mse_loss_layer_002": 0.006226, "value_mse_loss_layer_003": 0.010559, "value_mse_loss_layer_004": 0.00946, "value_mse_loss_layer_005": 0.008911, "value_mse_loss_layer_006": 0.011047, "value_mse_loss_layer_007": 0.011658, "value_mse_loss_layer_008": 0.014343, "value_mse_loss_layer_009": 0.018433, "value_mse_loss_layer_010": 0.016235, "value_mse_loss_layer_011": 0.017334, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.02002, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.018188, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.02356, "value_mse_loss_layer_020": 0.025024, "value_mse_loss_layer_021": 0.030029, "value_mse_loss_layer_022": 0.029785, "value_mse_loss_layer_023": 0.032227, "value_mse_loss_layer_024": 0.036621, "value_mse_loss_layer_025": 0.044189, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.046143, "value_mse_loss_layer_028": 0.050781, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000177, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000248, "vq_loss_layer_012": 0.00036, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000357, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.00036, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.003418 }, { "ce_loss": 2.310801, "epoch": 0.00803, "grad_norm": 0.0021006097085773945, "key_mse_loss_layer_000": 0.002808, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.051923, "kv_vq_loss": 0.00047, "learning_rate": 0.00097617888631967, "loss": 0.052417, "step": 8030, "value_mse_loss_layer_000": 0.000565, "value_mse_loss_layer_001": 0.001755, "value_mse_loss_layer_002": 0.006775, "value_mse_loss_layer_003": 0.010498, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.009583, "value_mse_loss_layer_006": 0.011292, "value_mse_loss_layer_007": 0.012146, "value_mse_loss_layer_008": 0.01416, "value_mse_loss_layer_009": 0.018311, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.016357, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.019531, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.017578, "value_mse_loss_layer_017": 0.021118, "value_mse_loss_layer_018": 0.019531, "value_mse_loss_layer_019": 0.021606, "value_mse_loss_layer_020": 0.023438, "value_mse_loss_layer_021": 0.027222, "value_mse_loss_layer_022": 0.0271, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.041748, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000184, "vq_loss_layer_008": 0.000212, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.00022, "vq_loss_layer_011": 0.000242, "vq_loss_layer_012": 0.000362, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000416, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000378, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000244, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.000391, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000467, "vq_loss_layer_026": 0.000542, "vq_loss_layer_027": 0.000675, "vq_loss_layer_028": 0.000969, "vq_loss_layer_029": 0.001137, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.004791 }, { "ce_loss": 2.318627, "epoch": 0.00804, "grad_norm": 0.0029410389252007008, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.105957, "key_mse_loss_layer_017": 0.107422, "key_mse_loss_layer_018": 0.112305, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.106934, "key_mse_loss_layer_021": 0.100586, "key_mse_loss_layer_022": 0.103027, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.052151, "kv_vq_loss": 0.0005, "learning_rate": 0.0009763140121871126, "loss": 0.052658, "step": 8040, "value_mse_loss_layer_000": 0.000572, "value_mse_loss_layer_001": 0.001747, "value_mse_loss_layer_002": 0.006195, "value_mse_loss_layer_003": 0.010071, "value_mse_loss_layer_004": 0.009338, "value_mse_loss_layer_005": 0.009033, "value_mse_loss_layer_006": 0.01123, "value_mse_loss_layer_007": 0.011841, "value_mse_loss_layer_008": 0.014038, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.016968, "value_mse_loss_layer_012": 0.018799, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.017456, "value_mse_loss_layer_017": 0.021851, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.023193, "value_mse_loss_layer_020": 0.024292, "value_mse_loss_layer_021": 0.025879, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.044678, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000137, "vq_loss_layer_007": 0.000179, "vq_loss_layer_008": 0.000185, "vq_loss_layer_009": 0.00024, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.000515, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000422, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000341, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000219, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.00036, "vq_loss_layer_026": 0.000935, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.002625, "vq_loss_layer_031": 0.003571 }, { "ce_loss": 2.384538, "epoch": 0.00805, "grad_norm": 0.002778549212962389, "key_mse_loss_layer_000": 0.002792, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.051831, "kv_vq_loss": 0.000473, "learning_rate": 0.0009764489700919671, "loss": 0.052325, "step": 8050, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.00174, "value_mse_loss_layer_002": 0.006134, "value_mse_loss_layer_003": 0.010254, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.008789, "value_mse_loss_layer_006": 0.010742, "value_mse_loss_layer_007": 0.011719, "value_mse_loss_layer_008": 0.014221, "value_mse_loss_layer_009": 0.019287, "value_mse_loss_layer_010": 0.015442, "value_mse_loss_layer_011": 0.016357, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.023682, "value_mse_loss_layer_016": 0.0177, "value_mse_loss_layer_017": 0.022095, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.023804, "value_mse_loss_layer_021": 0.03064, "value_mse_loss_layer_022": 0.028442, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.000223, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.000385, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.000504, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000429, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.00025, "vq_loss_layer_026": 0.000469, "vq_loss_layer_027": 0.000469, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.003662 }, { "ce_loss": 2.312726, "epoch": 0.00806, "grad_norm": 0.0020967607852071524, "key_mse_loss_layer_000": 0.004028, "key_mse_loss_layer_001": 0.012817, "key_mse_loss_layer_002": 0.067383, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.10498, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.102051, "key_mse_loss_layer_023": 0.103027, "key_mse_loss_layer_024": 0.086914, "key_mse_loss_layer_025": 0.084473, "key_mse_loss_layer_026": 0.095703, "key_mse_loss_layer_027": 0.105957, "key_mse_loss_layer_028": 0.107422, "key_mse_loss_layer_029": 0.104492, "key_mse_loss_layer_030": 0.102539, "key_mse_loss_layer_031": 0.088379, "kv_mse_loss": 0.052469, "kv_vq_loss": 0.000471, "learning_rate": 0.0009765837604512725, "loss": 0.052979, "step": 8060, "value_mse_loss_layer_000": 0.000572, "value_mse_loss_layer_001": 0.001793, "value_mse_loss_layer_002": 0.006714, "value_mse_loss_layer_003": 0.011658, "value_mse_loss_layer_004": 0.01001, "value_mse_loss_layer_005": 0.009888, "value_mse_loss_layer_006": 0.011353, "value_mse_loss_layer_007": 0.011597, "value_mse_loss_layer_008": 0.013916, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013611, "value_mse_loss_layer_011": 0.014954, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.017578, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.02478, "value_mse_loss_layer_019": 0.024292, "value_mse_loss_layer_020": 0.026123, "value_mse_loss_layer_021": 0.02771, "value_mse_loss_layer_022": 0.030884, "value_mse_loss_layer_023": 0.036133, "value_mse_loss_layer_024": 0.040527, "value_mse_loss_layer_025": 0.049316, "value_mse_loss_layer_026": 0.048584, "value_mse_loss_layer_027": 0.068848, "value_mse_loss_layer_028": 0.063477, "value_mse_loss_layer_029": 0.084473, "value_mse_loss_layer_030": 0.086914, "value_mse_loss_layer_031": 0.080078, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 2e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 4.7e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000141, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000207, "vq_loss_layer_009": 0.000177, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000237, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000326, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000315, "vq_loss_layer_019": 0.00019, "vq_loss_layer_020": 0.000148, "vq_loss_layer_021": 0.000271, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.0002, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000483, "vq_loss_layer_026": 0.000675, "vq_loss_layer_027": 0.000931, "vq_loss_layer_028": 0.001518, "vq_loss_layer_029": 0.002121, "vq_loss_layer_030": 0.00325, "vq_loss_layer_031": 0.006256 }, { "ce_loss": 2.328378, "epoch": 0.00807, "grad_norm": 0.002608675044029951, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.052158, "kv_vq_loss": 0.000477, "learning_rate": 0.0009767183836805174, "loss": 0.052655, "step": 8070, "value_mse_loss_layer_000": 0.000629, "value_mse_loss_layer_001": 0.001747, "value_mse_loss_layer_002": 0.006378, "value_mse_loss_layer_003": 0.010315, "value_mse_loss_layer_004": 0.010071, "value_mse_loss_layer_005": 0.009521, "value_mse_loss_layer_006": 0.011353, "value_mse_loss_layer_007": 0.011963, "value_mse_loss_layer_008": 0.014465, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.015991, "value_mse_loss_layer_011": 0.016357, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.017944, "value_mse_loss_layer_017": 0.021851, "value_mse_loss_layer_018": 0.020142, "value_mse_loss_layer_019": 0.021973, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.026367, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.066895, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.06543, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 7.3e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000226, "vq_loss_layer_011": 0.000238, "vq_loss_layer_012": 0.000362, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000406, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000391, "vq_loss_layer_017": 0.000357, "vq_loss_layer_018": 0.000224, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000238, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000309, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.000355, "vq_loss_layer_026": 0.000486, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.000923, "vq_loss_layer_030": 0.001755, "vq_loss_layer_031": 0.00415 }, { "ce_loss": 2.275985, "epoch": 0.00808, "grad_norm": 0.002290121978148818, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.05238, "kv_vq_loss": 0.000486, "learning_rate": 0.0009768528401936465, "loss": 0.052872, "step": 8080, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.001724, "value_mse_loss_layer_002": 0.006317, "value_mse_loss_layer_003": 0.010742, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.008728, "value_mse_loss_layer_006": 0.010864, "value_mse_loss_layer_007": 0.011597, "value_mse_loss_layer_008": 0.014221, "value_mse_loss_layer_009": 0.019043, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.018799, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019775, "value_mse_loss_layer_015": 0.021973, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.019531, "value_mse_loss_layer_019": 0.02356, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.029907, "value_mse_loss_layer_022": 0.028564, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.037354, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.053223, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000176, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000463, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000401, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000248, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000422, "vq_loss_layer_028": 0.000816, "vq_loss_layer_029": 0.000759, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.003357 }, { "ce_loss": 2.284651, "epoch": 0.00809, "grad_norm": 0.0022898607421666384, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.052295, "kv_vq_loss": 0.000477, "learning_rate": 0.000976987130403068, "loss": 0.052786, "step": 8090, "value_mse_loss_layer_000": 0.000576, "value_mse_loss_layer_001": 0.001732, "value_mse_loss_layer_002": 0.006073, "value_mse_loss_layer_003": 0.010071, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.009521, "value_mse_loss_layer_006": 0.011597, "value_mse_loss_layer_007": 0.011719, "value_mse_loss_layer_008": 0.014832, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015442, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.018188, "value_mse_loss_layer_017": 0.022095, "value_mse_loss_layer_018": 0.019165, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.024536, "value_mse_loss_layer_021": 0.029053, "value_mse_loss_layer_022": 0.028442, "value_mse_loss_layer_023": 0.030884, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.064453, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.000142, "vq_loss_layer_007": 0.000176, "vq_loss_layer_008": 0.0002, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000189, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000366, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.00036, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000366, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000226, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.001884, "vq_loss_layer_031": 0.003616 }, { "ce_loss": 2.284341, "epoch": 0.0081, "grad_norm": 0.002306660870090127, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.120117, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.051993, "kv_vq_loss": 0.000474, "learning_rate": 0.0009771212547196623, "loss": 0.05249, "step": 8100, "value_mse_loss_layer_000": 0.000587, "value_mse_loss_layer_001": 0.001755, "value_mse_loss_layer_002": 0.006287, "value_mse_loss_layer_003": 0.009888, "value_mse_loss_layer_004": 0.010254, "value_mse_loss_layer_005": 0.009094, "value_mse_loss_layer_006": 0.010864, "value_mse_loss_layer_007": 0.011719, "value_mse_loss_layer_008": 0.014221, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.016724, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.018921, "value_mse_loss_layer_014": 0.019531, "value_mse_loss_layer_015": 0.021606, "value_mse_loss_layer_016": 0.017578, "value_mse_loss_layer_017": 0.022095, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.023682, "value_mse_loss_layer_021": 0.027222, "value_mse_loss_layer_022": 0.027832, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 9.1e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000189, "vq_loss_layer_009": 0.000235, "vq_loss_layer_010": 0.000223, "vq_loss_layer_011": 0.000248, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000385, "vq_loss_layer_017": 0.000393, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000226, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000319, "vq_loss_layer_025": 0.000425, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000923, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.003632 }, { "ce_loss": 2.320798, "epoch": 0.00811, "grad_norm": 0.0029267503414303064, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.052313, "kv_vq_loss": 0.00049, "learning_rate": 0.0009772552135527887, "loss": 0.052823, "step": 8110, "value_mse_loss_layer_000": 0.00058, "value_mse_loss_layer_001": 0.001747, "value_mse_loss_layer_002": 0.006195, "value_mse_loss_layer_003": 0.010254, "value_mse_loss_layer_004": 0.009521, "value_mse_loss_layer_005": 0.008911, "value_mse_loss_layer_006": 0.010925, "value_mse_loss_layer_007": 0.011597, "value_mse_loss_layer_008": 0.014221, "value_mse_loss_layer_009": 0.018921, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.016724, "value_mse_loss_layer_012": 0.018555, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.021851, "value_mse_loss_layer_016": 0.018433, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.019409, "value_mse_loss_layer_019": 0.021851, "value_mse_loss_layer_020": 0.02356, "value_mse_loss_layer_021": 0.027344, "value_mse_loss_layer_022": 0.0271, "value_mse_loss_layer_023": 0.032715, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.056152, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.064941, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.000182, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000458, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000402, "vq_loss_layer_015": 0.000427, "vq_loss_layer_016": 0.000389, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000214, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.001785, "vq_loss_layer_029": 0.000904, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.003738 }, { "ce_loss": 2.273523, "epoch": 0.00812, "grad_norm": 0.0027367642614990473, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.052213, "kv_vq_loss": 0.000486, "learning_rate": 0.0009773890073102936, "loss": 0.052725, "step": 8120, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001785, "value_mse_loss_layer_002": 0.006165, "value_mse_loss_layer_003": 0.010254, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.009033, "value_mse_loss_layer_006": 0.011169, "value_mse_loss_layer_007": 0.012024, "value_mse_loss_layer_008": 0.014832, "value_mse_loss_layer_009": 0.018799, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.021973, "value_mse_loss_layer_016": 0.017944, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.019287, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.023804, "value_mse_loss_layer_021": 0.03125, "value_mse_loss_layer_022": 0.029297, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.049805, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.061523, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000135, "vq_loss_layer_007": 0.000203, "vq_loss_layer_008": 0.000212, "vq_loss_layer_009": 0.00024, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.00023, "vq_loss_layer_012": 0.000364, "vq_loss_layer_013": 0.000355, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000393, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.001648, "vq_loss_layer_031": 0.003281 }, { "ce_loss": 2.283507, "epoch": 0.00813, "grad_norm": 0.0021429562475532293, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.090332, "key_mse_loss_layer_009": 0.097656, "key_mse_loss_layer_010": 0.108398, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.133789, "key_mse_loss_layer_014": 0.129883, "key_mse_loss_layer_015": 0.117188, "key_mse_loss_layer_016": 0.109863, "key_mse_loss_layer_017": 0.108887, "key_mse_loss_layer_018": 0.114746, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.101074, "key_mse_loss_layer_022": 0.105469, "key_mse_loss_layer_023": 0.101074, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.094727, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.093262, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.052173, "kv_vq_loss": 0.000462, "learning_rate": 0.000977522636398517, "loss": 0.052667, "step": 8130, "value_mse_loss_layer_000": 0.00058, "value_mse_loss_layer_001": 0.001724, "value_mse_loss_layer_002": 0.006165, "value_mse_loss_layer_003": 0.010315, "value_mse_loss_layer_004": 0.00946, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.010986, "value_mse_loss_layer_007": 0.011963, "value_mse_loss_layer_008": 0.014343, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.01709, "value_mse_loss_layer_012": 0.017334, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.019531, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.016968, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.021362, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.026367, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.061768, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.0002, "vq_loss_layer_008": 0.000218, "vq_loss_layer_009": 0.000238, "vq_loss_layer_010": 0.000239, "vq_loss_layer_011": 0.000288, "vq_loss_layer_012": 0.000416, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.000465, "vq_loss_layer_015": 0.00045, "vq_loss_layer_016": 0.000383, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000433, "vq_loss_layer_022": 0.000309, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000458, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000603, "vq_loss_layer_028": 0.000866, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.004211 }, { "ce_loss": 2.33964, "epoch": 0.00814, "grad_norm": 0.0020055414643138647, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.05213, "kv_vq_loss": 0.000472, "learning_rate": 0.0009776561012223002, "loss": 0.052628, "step": 8140, "value_mse_loss_layer_000": 0.000584, "value_mse_loss_layer_001": 0.001724, "value_mse_loss_layer_002": 0.006927, "value_mse_loss_layer_003": 0.010132, "value_mse_loss_layer_004": 0.009705, "value_mse_loss_layer_005": 0.009216, "value_mse_loss_layer_006": 0.011475, "value_mse_loss_layer_007": 0.011963, "value_mse_loss_layer_008": 0.014343, "value_mse_loss_layer_009": 0.018311, "value_mse_loss_layer_010": 0.015625, "value_mse_loss_layer_011": 0.015991, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.021973, "value_mse_loss_layer_016": 0.018433, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.020508, "value_mse_loss_layer_019": 0.023071, "value_mse_loss_layer_020": 0.023804, "value_mse_loss_layer_021": 0.02832, "value_mse_loss_layer_022": 0.028687, "value_mse_loss_layer_023": 0.035156, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.041016, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.049072, "value_mse_loss_layer_028": 0.054443, "value_mse_loss_layer_029": 0.069824, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.064941, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000187, "vq_loss_layer_008": 0.000195, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000205, "vq_loss_layer_011": 0.000223, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.00037, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.000431, "vq_loss_layer_016": 0.000395, "vq_loss_layer_017": 0.000353, "vq_loss_layer_018": 0.000225, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.000218, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000622, "vq_loss_layer_028": 0.000862, "vq_loss_layer_029": 0.001122, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.003876 }, { "ce_loss": 2.270476, "epoch": 0.00815, "grad_norm": 0.002618312370032072, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.051859, "kv_vq_loss": 0.000476, "learning_rate": 0.000977789402184994, "loss": 0.052353, "step": 8150, "value_mse_loss_layer_000": 0.000587, "value_mse_loss_layer_001": 0.00174, "value_mse_loss_layer_002": 0.006134, "value_mse_loss_layer_003": 0.010559, "value_mse_loss_layer_004": 0.009277, "value_mse_loss_layer_005": 0.009155, "value_mse_loss_layer_006": 0.010925, "value_mse_loss_layer_007": 0.011414, "value_mse_loss_layer_008": 0.014282, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.015564, "value_mse_loss_layer_011": 0.016479, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.020386, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.018677, "value_mse_loss_layer_017": 0.022339, "value_mse_loss_layer_018": 0.019287, "value_mse_loss_layer_019": 0.022705, "value_mse_loss_layer_020": 0.024658, "value_mse_loss_layer_021": 0.032715, "value_mse_loss_layer_022": 0.029297, "value_mse_loss_layer_023": 0.031738, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.039795, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.066895, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000412, "vq_loss_layer_015": 0.000441, "vq_loss_layer_016": 0.000383, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000214, "vq_loss_layer_021": 0.000446, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.00029, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000465, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.001663, "vq_loss_layer_031": 0.004028 }, { "ce_loss": 2.313466, "epoch": 0.00816, "grad_norm": 0.0017764559015631676, "key_mse_loss_layer_000": 0.003616, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.041992, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.082031, "key_mse_loss_layer_020": 0.089355, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.05835, "kv_mse_loss": 0.051901, "kv_vq_loss": 0.00047, "learning_rate": 0.0009779225396884651, "loss": 0.052405, "step": 8160, "value_mse_loss_layer_000": 0.000565, "value_mse_loss_layer_001": 0.001762, "value_mse_loss_layer_002": 0.006226, "value_mse_loss_layer_003": 0.010498, "value_mse_loss_layer_004": 0.009888, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.010864, "value_mse_loss_layer_007": 0.011658, "value_mse_loss_layer_008": 0.014404, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.01532, "value_mse_loss_layer_011": 0.015869, "value_mse_loss_layer_012": 0.018066, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.016968, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.019043, "value_mse_loss_layer_019": 0.023926, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.026733, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.037109, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.066406, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000196, "vq_loss_layer_008": 0.000232, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000233, "vq_loss_layer_011": 0.000242, "vq_loss_layer_012": 0.000496, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.00042, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000209, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.000425, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000383, "vq_loss_layer_026": 0.000572, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000935, "vq_loss_layer_029": 0.001038, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.004761 }, { "ce_loss": 2.287482, "epoch": 0.00817, "grad_norm": 0.002201682887971401, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.052225, "kv_vq_loss": 0.000466, "learning_rate": 0.0009780555141331037, "loss": 0.052719, "step": 8170, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001678, "value_mse_loss_layer_002": 0.006256, "value_mse_loss_layer_003": 0.01001, "value_mse_loss_layer_004": 0.00946, "value_mse_loss_layer_005": 0.009094, "value_mse_loss_layer_006": 0.011292, "value_mse_loss_layer_007": 0.011963, "value_mse_loss_layer_008": 0.01416, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.016113, "value_mse_loss_layer_011": 0.017334, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.02002, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.018066, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.018433, "value_mse_loss_layer_019": 0.021973, "value_mse_loss_layer_020": 0.023804, "value_mse_loss_layer_021": 0.026611, "value_mse_loss_layer_022": 0.0271, "value_mse_loss_layer_023": 0.031738, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.063477, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.0625, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000138, "vq_loss_layer_007": 0.000191, "vq_loss_layer_008": 0.000185, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000204, "vq_loss_layer_011": 0.000256, "vq_loss_layer_012": 0.000387, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000431, "vq_loss_layer_015": 0.000481, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000349, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.00032, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000946, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.003555 }, { "ce_loss": 2.312556, "epoch": 0.00818, "grad_norm": 0.0029483414255082607, "key_mse_loss_layer_000": 0.003647, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.111328, "key_mse_loss_layer_016": 0.103516, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.104004, "key_mse_loss_layer_021": 0.098633, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.099609, "key_mse_loss_layer_024": 0.080078, "key_mse_loss_layer_025": 0.07666, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.088867, "key_mse_loss_layer_028": 0.094727, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.0517, "kv_vq_loss": 0.00049, "learning_rate": 0.0009781883259178308, "loss": 0.052197, "step": 8180, "value_mse_loss_layer_000": 0.000576, "value_mse_loss_layer_001": 0.001732, "value_mse_loss_layer_002": 0.007111, "value_mse_loss_layer_003": 0.011719, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.011047, "value_mse_loss_layer_007": 0.011841, "value_mse_loss_layer_008": 0.014954, "value_mse_loss_layer_009": 0.018433, "value_mse_loss_layer_010": 0.015381, "value_mse_loss_layer_011": 0.016479, "value_mse_loss_layer_012": 0.017334, "value_mse_loss_layer_013": 0.018311, "value_mse_loss_layer_014": 0.018921, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.02124, "value_mse_loss_layer_019": 0.021606, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.026611, "value_mse_loss_layer_022": 0.026978, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.070312, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000129, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.00023, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000227, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.00042, "vq_loss_layer_013": 0.000345, "vq_loss_layer_014": 0.000441, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000404, "vq_loss_layer_017": 0.000353, "vq_loss_layer_018": 0.000231, "vq_loss_layer_019": 0.00021, "vq_loss_layer_020": 0.000207, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000341, "vq_loss_layer_024": 0.000376, "vq_loss_layer_025": 0.000423, "vq_loss_layer_026": 0.000618, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.001251, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.005096 }, { "ce_loss": 2.264054, "epoch": 0.00819, "grad_norm": 0.002270611235871911, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.040771, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.124512, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.07373, "key_mse_loss_layer_030": 0.074219, "key_mse_loss_layer_031": 0.054932, "kv_mse_loss": 0.052182, "kv_vq_loss": 0.000479, "learning_rate": 0.0009783209754401045, "loss": 0.052676, "step": 8190, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.001732, "value_mse_loss_layer_002": 0.006409, "value_mse_loss_layer_003": 0.010254, "value_mse_loss_layer_004": 0.010071, "value_mse_loss_layer_005": 0.009399, "value_mse_loss_layer_006": 0.011475, "value_mse_loss_layer_007": 0.012024, "value_mse_loss_layer_008": 0.014648, "value_mse_loss_layer_009": 0.020752, "value_mse_loss_layer_010": 0.0177, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.019287, "value_mse_loss_layer_013": 0.020142, "value_mse_loss_layer_014": 0.021362, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.017944, "value_mse_loss_layer_017": 0.021973, "value_mse_loss_layer_018": 0.018921, "value_mse_loss_layer_019": 0.022705, "value_mse_loss_layer_020": 0.023193, "value_mse_loss_layer_021": 0.026489, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.064453, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 8.8e-05, "vq_loss_layer_005": 8.8e-05, "vq_loss_layer_006": 0.000161, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.000228, "vq_loss_layer_009": 0.000362, "vq_loss_layer_010": 0.000338, "vq_loss_layer_011": 0.000296, "vq_loss_layer_012": 0.000526, "vq_loss_layer_013": 0.000364, "vq_loss_layer_014": 0.000526, "vq_loss_layer_015": 0.000465, "vq_loss_layer_016": 0.000437, "vq_loss_layer_017": 0.000402, "vq_loss_layer_018": 0.000231, "vq_loss_layer_019": 0.000217, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.000454, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000347, "vq_loss_layer_024": 0.000347, "vq_loss_layer_025": 0.00046, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.000725, "vq_loss_layer_028": 0.001076, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.005341 }, { "ce_loss": 2.243675, "epoch": 0.0082, "grad_norm": 0.00234781950712204, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.124512, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.105469, "key_mse_loss_layer_017": 0.106934, "key_mse_loss_layer_018": 0.11377, "key_mse_loss_layer_019": 0.094238, "key_mse_loss_layer_020": 0.105469, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.099609, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.052469, "kv_vq_loss": 0.000489, "learning_rate": 0.000978453463095929, "loss": 0.052969, "step": 8200, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001732, "value_mse_loss_layer_002": 0.006348, "value_mse_loss_layer_003": 0.010437, "value_mse_loss_layer_004": 0.009888, "value_mse_loss_layer_005": 0.009399, "value_mse_loss_layer_006": 0.011169, "value_mse_loss_layer_007": 0.012329, "value_mse_loss_layer_008": 0.014404, "value_mse_loss_layer_009": 0.018921, "value_mse_loss_layer_010": 0.016235, "value_mse_loss_layer_011": 0.01709, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.017578, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.018921, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.02478, "value_mse_loss_layer_021": 0.026611, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.030762, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.060059, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.067383, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000138, "vq_loss_layer_007": 0.000205, "vq_loss_layer_008": 0.000256, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000243, "vq_loss_layer_011": 0.000261, "vq_loss_layer_012": 0.000385, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000486, "vq_loss_layer_015": 0.000431, "vq_loss_layer_016": 0.000383, "vq_loss_layer_017": 0.000406, "vq_loss_layer_018": 0.000242, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.00024, "vq_loss_layer_021": 0.000399, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000341, "vq_loss_layer_024": 0.000385, "vq_loss_layer_025": 0.000406, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.001358, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.003296, "vq_loss_layer_031": 0.00473 }, { "ce_loss": 2.267754, "epoch": 0.00821, "grad_norm": 0.0021328283473849297, "key_mse_loss_layer_000": 0.00354, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.051993, "kv_vq_loss": 0.000488, "learning_rate": 0.0009785857892798601, "loss": 0.052496, "step": 8210, "value_mse_loss_layer_000": 0.000595, "value_mse_loss_layer_001": 0.001732, "value_mse_loss_layer_002": 0.008606, "value_mse_loss_layer_003": 0.011841, "value_mse_loss_layer_004": 0.009583, "value_mse_loss_layer_005": 0.009338, "value_mse_loss_layer_006": 0.011108, "value_mse_loss_layer_007": 0.012146, "value_mse_loss_layer_008": 0.014648, "value_mse_loss_layer_009": 0.019043, "value_mse_loss_layer_010": 0.015564, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.018921, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.018066, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.018433, "value_mse_loss_layer_019": 0.021851, "value_mse_loss_layer_020": 0.023682, "value_mse_loss_layer_021": 0.026611, "value_mse_loss_layer_022": 0.027222, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.062256, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.064453, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 2e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000193, "vq_loss_layer_008": 0.000211, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000209, "vq_loss_layer_011": 0.000254, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000492, "vq_loss_layer_016": 0.000397, "vq_loss_layer_017": 0.000399, "vq_loss_layer_018": 0.000223, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.000414, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000359, "vq_loss_layer_026": 0.000553, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.001045, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.003494, "vq_loss_layer_031": 0.004486 }, { "ce_loss": 2.300155, "epoch": 0.00822, "grad_norm": 0.002227520104497671, "key_mse_loss_layer_000": 0.003799, "key_mse_loss_layer_001": 0.011047, "key_mse_loss_layer_002": 0.060303, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.091797, "key_mse_loss_layer_009": 0.098145, "key_mse_loss_layer_010": 0.108398, "key_mse_loss_layer_011": 0.105469, "key_mse_loss_layer_012": 0.07959, "key_mse_loss_layer_013": 0.135742, "key_mse_loss_layer_014": 0.132812, "key_mse_loss_layer_015": 0.119141, "key_mse_loss_layer_016": 0.115234, "key_mse_loss_layer_017": 0.112305, "key_mse_loss_layer_018": 0.12207, "key_mse_loss_layer_019": 0.097168, "key_mse_loss_layer_020": 0.109863, "key_mse_loss_layer_021": 0.104004, "key_mse_loss_layer_022": 0.11084, "key_mse_loss_layer_023": 0.108887, "key_mse_loss_layer_024": 0.089355, "key_mse_loss_layer_025": 0.08252, "key_mse_loss_layer_026": 0.097656, "key_mse_loss_layer_027": 0.099121, "key_mse_loss_layer_028": 0.103516, "key_mse_loss_layer_029": 0.096191, "key_mse_loss_layer_030": 0.104004, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.051862, "kv_vq_loss": 0.000469, "learning_rate": 0.0009787179543850125, "loss": 0.052356, "step": 8220, "value_mse_loss_layer_000": 0.000584, "value_mse_loss_layer_001": 0.001747, "value_mse_loss_layer_002": 0.006958, "value_mse_loss_layer_003": 0.010437, "value_mse_loss_layer_004": 0.010742, "value_mse_loss_layer_005": 0.010132, "value_mse_loss_layer_006": 0.011169, "value_mse_loss_layer_007": 0.011902, "value_mse_loss_layer_008": 0.014282, "value_mse_loss_layer_009": 0.018311, "value_mse_loss_layer_010": 0.015442, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.018799, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.019775, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.022583, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.027222, "value_mse_loss_layer_022": 0.026611, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.05127, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.071777, "value_mse_loss_layer_031": 0.067871, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 8.9e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000144, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.00023, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.000557, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000483, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.00028, "vq_loss_layer_019": 0.000207, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.000404, "vq_loss_layer_022": 0.000326, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000328, "vq_loss_layer_025": 0.00045, "vq_loss_layer_026": 0.000607, "vq_loss_layer_027": 0.00066, "vq_loss_layer_028": 0.001175, "vq_loss_layer_029": 0.001335, "vq_loss_layer_030": 0.00354, "vq_loss_layer_031": 0.005096 }, { "ce_loss": 2.278168, "epoch": 0.00823, "grad_norm": 0.0036021580453962088, "key_mse_loss_layer_000": 0.003616, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.109863, "key_mse_loss_layer_016": 0.102539, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.109863, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.097168, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.052231, "kv_vq_loss": 0.000486, "learning_rate": 0.0009788499588030673, "loss": 0.052731, "step": 8230, "value_mse_loss_layer_000": 0.000565, "value_mse_loss_layer_001": 0.001717, "value_mse_loss_layer_002": 0.006073, "value_mse_loss_layer_003": 0.010132, "value_mse_loss_layer_004": 0.009521, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.010986, "value_mse_loss_layer_007": 0.011475, "value_mse_loss_layer_008": 0.01416, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.016724, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.018433, "value_mse_loss_layer_019": 0.023315, "value_mse_loss_layer_020": 0.026855, "value_mse_loss_layer_021": 0.026123, "value_mse_loss_layer_022": 0.028564, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.078125, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000172, "vq_loss_layer_008": 0.0002, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000236, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000389, "vq_loss_layer_013": 0.000412, "vq_loss_layer_014": 0.000441, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000423, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000246, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000336, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000816, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.00383 }, { "ce_loss": 2.268205, "epoch": 0.00824, "grad_norm": 0.002026776783168316, "key_mse_loss_layer_000": 0.002808, "key_mse_loss_layer_001": 0.009705, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.052243, "kv_vq_loss": 0.000476, "learning_rate": 0.0009789818029242788, "loss": 0.05274, "step": 8240, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001694, "value_mse_loss_layer_002": 0.006042, "value_mse_loss_layer_003": 0.010132, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008911, "value_mse_loss_layer_006": 0.010925, "value_mse_loss_layer_007": 0.012268, "value_mse_loss_layer_008": 0.014099, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.015503, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.022461, "value_mse_loss_layer_016": 0.018066, "value_mse_loss_layer_017": 0.022583, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.022095, "value_mse_loss_layer_020": 0.025024, "value_mse_loss_layer_021": 0.026978, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.029541, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.061279, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000203, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000189, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000454, "vq_loss_layer_016": 0.000372, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000237, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000347, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000603, "vq_loss_layer_028": 0.000832, "vq_loss_layer_029": 0.000916, "vq_loss_layer_030": 0.001823, "vq_loss_layer_031": 0.003799 }, { "ce_loss": 2.292104, "epoch": 0.00825, "grad_norm": 0.0026040002703666687, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.060791, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.052316, "kv_vq_loss": 0.000471, "learning_rate": 0.000979113487137481, "loss": 0.052817, "step": 8250, "value_mse_loss_layer_000": 0.000584, "value_mse_loss_layer_001": 0.001724, "value_mse_loss_layer_002": 0.006287, "value_mse_loss_layer_003": 0.009644, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.010559, "value_mse_loss_layer_007": 0.01123, "value_mse_loss_layer_008": 0.014038, "value_mse_loss_layer_009": 0.018799, "value_mse_loss_layer_010": 0.015259, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.018311, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.022217, "value_mse_loss_layer_016": 0.019653, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.020142, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.028809, "value_mse_loss_layer_022": 0.028442, "value_mse_loss_layer_023": 0.03418, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.041016, "value_mse_loss_layer_026": 0.038818, "value_mse_loss_layer_027": 0.046631, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.072754, "value_mse_loss_layer_030": 0.070801, "value_mse_loss_layer_031": 0.064453, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000188, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000391, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000399, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000213, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000219, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000234, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000645, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.283644, "epoch": 0.00826, "grad_norm": 0.0020842247176915407, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.100098, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.111816, "key_mse_loss_layer_019": 0.095703, "key_mse_loss_layer_020": 0.10498, "key_mse_loss_layer_021": 0.100586, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.103516, "key_mse_loss_layer_024": 0.083008, "key_mse_loss_layer_025": 0.081055, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.094727, "key_mse_loss_layer_028": 0.102539, "key_mse_loss_layer_029": 0.095215, "key_mse_loss_layer_030": 0.099121, "key_mse_loss_layer_031": 0.081055, "kv_mse_loss": 0.051816, "kv_vq_loss": 0.000476, "learning_rate": 0.0009792450118300956, "loss": 0.05231, "step": 8260, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001717, "value_mse_loss_layer_002": 0.006226, "value_mse_loss_layer_003": 0.01001, "value_mse_loss_layer_004": 0.009583, "value_mse_loss_layer_005": 0.009949, "value_mse_loss_layer_006": 0.010925, "value_mse_loss_layer_007": 0.01178, "value_mse_loss_layer_008": 0.014526, "value_mse_loss_layer_009": 0.018433, "value_mse_loss_layer_010": 0.015259, "value_mse_loss_layer_011": 0.016846, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.023804, "value_mse_loss_layer_020": 0.024292, "value_mse_loss_layer_021": 0.027832, "value_mse_loss_layer_022": 0.028809, "value_mse_loss_layer_023": 0.032227, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.047363, "value_mse_loss_layer_026": 0.037842, "value_mse_loss_layer_027": 0.049805, "value_mse_loss_layer_028": 0.052979, "value_mse_loss_layer_029": 0.069336, "value_mse_loss_layer_030": 0.071777, "value_mse_loss_layer_031": 0.069824, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 9.5e-05, "vq_loss_layer_006": 0.000133, "vq_loss_layer_007": 0.000185, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000233, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.00025, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000366, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.00045, "vq_loss_layer_016": 0.00036, "vq_loss_layer_017": 0.000261, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.00014, "vq_loss_layer_020": 0.000166, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000193, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.00034, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.001205, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003891 }, { "ce_loss": 2.257081, "epoch": 0.00827, "grad_norm": 0.00206579128280282, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.01123, "key_mse_loss_layer_002": 0.060059, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.102539, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.110352, "key_mse_loss_layer_019": 0.094727, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.098633, "key_mse_loss_layer_022": 0.103027, "key_mse_loss_layer_023": 0.101562, "key_mse_loss_layer_024": 0.083496, "key_mse_loss_layer_025": 0.080078, "key_mse_loss_layer_026": 0.093262, "key_mse_loss_layer_027": 0.09668, "key_mse_loss_layer_028": 0.101074, "key_mse_loss_layer_029": 0.095703, "key_mse_loss_layer_030": 0.099609, "key_mse_loss_layer_031": 0.078613, "kv_mse_loss": 0.052167, "kv_vq_loss": 0.00048, "learning_rate": 0.0009793763773881365, "loss": 0.052661, "step": 8270, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001724, "value_mse_loss_layer_002": 0.006409, "value_mse_loss_layer_003": 0.010559, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.009521, "value_mse_loss_layer_006": 0.011108, "value_mse_loss_layer_007": 0.012024, "value_mse_loss_layer_008": 0.013977, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014771, "value_mse_loss_layer_011": 0.015625, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.021851, "value_mse_loss_layer_019": 0.023804, "value_mse_loss_layer_020": 0.026001, "value_mse_loss_layer_021": 0.026855, "value_mse_loss_layer_022": 0.027832, "value_mse_loss_layer_023": 0.031006, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.043701, "value_mse_loss_layer_026": 0.040283, "value_mse_loss_layer_027": 0.050049, "value_mse_loss_layer_028": 0.052979, "value_mse_loss_layer_029": 0.073242, "value_mse_loss_layer_030": 0.073242, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000187, "vq_loss_layer_008": 0.000201, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000399, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000261, "vq_loss_layer_019": 0.000207, "vq_loss_layer_020": 0.000215, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.000422, "vq_loss_layer_026": 0.000732, "vq_loss_layer_027": 0.000717, "vq_loss_layer_028": 0.001015, "vq_loss_layer_029": 0.001808, "vq_loss_layer_030": 0.002686, "vq_loss_layer_031": 0.005066 }, { "ce_loss": 2.316983, "epoch": 0.00828, "grad_norm": 0.00222676619887352, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.052243, "kv_vq_loss": 0.000461, "learning_rate": 0.00097950758419622, "loss": 0.052734, "step": 8280, "value_mse_loss_layer_000": 0.000565, "value_mse_loss_layer_001": 0.001709, "value_mse_loss_layer_002": 0.006256, "value_mse_loss_layer_003": 0.01001, "value_mse_loss_layer_004": 0.009766, "value_mse_loss_layer_005": 0.009155, "value_mse_loss_layer_006": 0.010864, "value_mse_loss_layer_007": 0.011658, "value_mse_loss_layer_008": 0.014404, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.015381, "value_mse_loss_layer_011": 0.016235, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.019043, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.023438, "value_mse_loss_layer_021": 0.027344, "value_mse_loss_layer_022": 0.0271, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.052246, "value_mse_loss_layer_029": 0.061768, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.064941, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000202, "vq_loss_layer_009": 0.000269, "vq_loss_layer_010": 0.000215, "vq_loss_layer_011": 0.000235, "vq_loss_layer_012": 0.000385, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000412, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000374, "vq_loss_layer_017": 0.000368, "vq_loss_layer_018": 0.000225, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.00033, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000364, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.001152, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.270733, "epoch": 0.00829, "grad_norm": 0.00240313820540905, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.051923, "kv_vq_loss": 0.000467, "learning_rate": 0.0009796386326375683, "loss": 0.052417, "step": 8290, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001724, "value_mse_loss_layer_002": 0.006134, "value_mse_loss_layer_003": 0.01178, "value_mse_loss_layer_004": 0.009521, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.01123, "value_mse_loss_layer_007": 0.011963, "value_mse_loss_layer_008": 0.014221, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.015381, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.020264, "value_mse_loss_layer_015": 0.022095, "value_mse_loss_layer_016": 0.021851, "value_mse_loss_layer_017": 0.021973, "value_mse_loss_layer_018": 0.018921, "value_mse_loss_layer_019": 0.022705, "value_mse_loss_layer_020": 0.023682, "value_mse_loss_layer_021": 0.027588, "value_mse_loss_layer_022": 0.0271, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.065918, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000148, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000224, "vq_loss_layer_010": 0.000204, "vq_loss_layer_011": 0.000235, "vq_loss_layer_012": 0.00036, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000446, "vq_loss_layer_015": 0.000427, "vq_loss_layer_016": 0.000542, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000397, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000341, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000362, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.00087, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.004242 }, { "ce_loss": 2.303691, "epoch": 0.0083, "grad_norm": 0.0020590710919350386, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.052103, "kv_vq_loss": 0.000474, "learning_rate": 0.0009797695230940185, "loss": 0.052603, "step": 8300, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.00174, "value_mse_loss_layer_002": 0.006409, "value_mse_loss_layer_003": 0.011169, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.009094, "value_mse_loss_layer_006": 0.010742, "value_mse_loss_layer_007": 0.011475, "value_mse_loss_layer_008": 0.014038, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015259, "value_mse_loss_layer_011": 0.015869, "value_mse_loss_layer_012": 0.018188, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018677, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.018677, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.019287, "value_mse_loss_layer_019": 0.022583, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.026733, "value_mse_loss_layer_022": 0.027222, "value_mse_loss_layer_023": 0.030884, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.064941, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.06543, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000172, "vq_loss_layer_008": 0.000187, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.000202, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000486, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000397, "vq_loss_layer_017": 0.00036, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000313, "vq_loss_layer_024": 0.000301, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.000957, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.00386 }, { "ce_loss": 2.305404, "epoch": 0.00831, "grad_norm": 0.0028822841122746468, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.111816, "key_mse_loss_layer_016": 0.104492, "key_mse_loss_layer_017": 0.105469, "key_mse_loss_layer_018": 0.110352, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.10498, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.102051, "key_mse_loss_layer_023": 0.100586, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.051776, "kv_vq_loss": 0.000477, "learning_rate": 0.0009799002559460275, "loss": 0.05228, "step": 8310, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001663, "value_mse_loss_layer_002": 0.005981, "value_mse_loss_layer_003": 0.010132, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.009094, "value_mse_loss_layer_006": 0.011108, "value_mse_loss_layer_007": 0.011719, "value_mse_loss_layer_008": 0.013977, "value_mse_loss_layer_009": 0.018921, "value_mse_loss_layer_010": 0.017456, "value_mse_loss_layer_011": 0.016479, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.023926, "value_mse_loss_layer_021": 0.026489, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.061035, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000139, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000259, "vq_loss_layer_011": 0.000244, "vq_loss_layer_012": 0.000393, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000441, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.00017, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.000229, "vq_loss_layer_021": 0.000412, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000315, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000406, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000969, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.002502, "vq_loss_layer_031": 0.004852 }, { "ce_loss": 2.337965, "epoch": 0.00832, "grad_norm": 0.00186544272582978, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.051984, "kv_vq_loss": 0.000479, "learning_rate": 0.0009800308315726807, "loss": 0.052487, "step": 8320, "value_mse_loss_layer_000": 0.000584, "value_mse_loss_layer_001": 0.001732, "value_mse_loss_layer_002": 0.006653, "value_mse_loss_layer_003": 0.009949, "value_mse_loss_layer_004": 0.009094, "value_mse_loss_layer_005": 0.008972, "value_mse_loss_layer_006": 0.010925, "value_mse_loss_layer_007": 0.011719, "value_mse_loss_layer_008": 0.014404, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.022095, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.021118, "value_mse_loss_layer_019": 0.021973, "value_mse_loss_layer_020": 0.023804, "value_mse_loss_layer_021": 0.026733, "value_mse_loss_layer_022": 0.027466, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.062012, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000129, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.00024, "vq_loss_layer_010": 0.000209, "vq_loss_layer_011": 0.000221, "vq_loss_layer_012": 0.000374, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000223, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000351, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000828, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.002502, "vq_loss_layer_031": 0.003464 }, { "ce_loss": 2.307941, "epoch": 0.00833, "grad_norm": 0.002147093415260315, "key_mse_loss_layer_000": 0.003586, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.052032, "kv_vq_loss": 0.000449, "learning_rate": 0.0009801612503516968, "loss": 0.052521, "step": 8330, "value_mse_loss_layer_000": 0.000576, "value_mse_loss_layer_001": 0.00177, "value_mse_loss_layer_002": 0.006348, "value_mse_loss_layer_003": 0.010132, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.008911, "value_mse_loss_layer_006": 0.010803, "value_mse_loss_layer_007": 0.011536, "value_mse_loss_layer_008": 0.014221, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.015015, "value_mse_loss_layer_011": 0.016235, "value_mse_loss_layer_012": 0.016602, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.019043, "value_mse_loss_layer_015": 0.021606, "value_mse_loss_layer_016": 0.020264, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.019531, "value_mse_loss_layer_019": 0.023193, "value_mse_loss_layer_020": 0.023926, "value_mse_loss_layer_021": 0.026245, "value_mse_loss_layer_022": 0.027466, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.036377, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.068848, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.000202, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000198, "vq_loss_layer_011": 0.00023, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000452, "vq_loss_layer_017": 0.000393, "vq_loss_layer_018": 0.000215, "vq_loss_layer_019": 0.000192, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.001137, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.293307, "epoch": 0.00834, "grad_norm": 0.002708028769120574, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.07959, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.068359, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.051935, "kv_vq_loss": 0.000475, "learning_rate": 0.0009802915126594345, "loss": 0.052441, "step": 8340, "value_mse_loss_layer_000": 0.000576, "value_mse_loss_layer_001": 0.001709, "value_mse_loss_layer_002": 0.005981, "value_mse_loss_layer_003": 0.01001, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.00885, "value_mse_loss_layer_006": 0.010925, "value_mse_loss_layer_007": 0.011414, "value_mse_loss_layer_008": 0.013855, "value_mse_loss_layer_009": 0.018433, "value_mse_loss_layer_010": 0.014709, "value_mse_loss_layer_011": 0.016235, "value_mse_loss_layer_012": 0.018433, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.017944, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.023071, "value_mse_loss_layer_021": 0.0271, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.065918, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.060791, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000225, "vq_loss_layer_012": 0.00042, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.00036, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000362, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000713, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.003708 }, { "ce_loss": 2.284043, "epoch": 0.00835, "grad_norm": 0.0020679577719420195, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.052057, "kv_vq_loss": 0.000474, "learning_rate": 0.0009804216188709005, "loss": 0.052557, "step": 8350, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001686, "value_mse_loss_layer_002": 0.006165, "value_mse_loss_layer_003": 0.009705, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.008972, "value_mse_loss_layer_006": 0.010681, "value_mse_loss_layer_007": 0.011658, "value_mse_loss_layer_008": 0.014465, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.016724, "value_mse_loss_layer_011": 0.016479, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.019531, "value_mse_loss_layer_015": 0.021973, "value_mse_loss_layer_016": 0.017578, "value_mse_loss_layer_017": 0.021851, "value_mse_loss_layer_018": 0.020874, "value_mse_loss_layer_019": 0.021973, "value_mse_loss_layer_020": 0.023804, "value_mse_loss_layer_021": 0.026978, "value_mse_loss_layer_022": 0.027588, "value_mse_loss_layer_023": 0.031982, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.063477, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000195, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000232, "vq_loss_layer_011": 0.000228, "vq_loss_layer_012": 0.000376, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000427, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000378, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.000229, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000345, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.00029, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.003632 }, { "ce_loss": 2.296755, "epoch": 0.00836, "grad_norm": 0.0023676923010498285, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.07959, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.075684, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.051678, "kv_vq_loss": 0.000455, "learning_rate": 0.0009805515693597539, "loss": 0.052167, "step": 8360, "value_mse_loss_layer_000": 0.000576, "value_mse_loss_layer_001": 0.001709, "value_mse_loss_layer_002": 0.006042, "value_mse_loss_layer_003": 0.009949, "value_mse_loss_layer_004": 0.009949, "value_mse_loss_layer_005": 0.009033, "value_mse_loss_layer_006": 0.010803, "value_mse_loss_layer_007": 0.011475, "value_mse_loss_layer_008": 0.014099, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015625, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.022217, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.021973, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.025391, "value_mse_loss_layer_021": 0.029175, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.039551, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.062256, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.00012, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000179, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000385, "vq_loss_layer_015": 0.000444, "vq_loss_layer_016": 0.000381, "vq_loss_layer_017": 0.000347, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000225, "vq_loss_layer_021": 0.000429, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.000942, "vq_loss_layer_030": 0.003082, "vq_loss_layer_031": 0.003891 }, { "ce_loss": 2.334452, "epoch": 0.00837, "grad_norm": 0.002476691035553813, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.052039, "kv_vq_loss": 0.00047, "learning_rate": 0.000980681364498315, "loss": 0.052533, "step": 8370, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.001724, "value_mse_loss_layer_002": 0.006378, "value_mse_loss_layer_003": 0.009766, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.00885, "value_mse_loss_layer_006": 0.011047, "value_mse_loss_layer_007": 0.011841, "value_mse_loss_layer_008": 0.014038, "value_mse_loss_layer_009": 0.018433, "value_mse_loss_layer_010": 0.015259, "value_mse_loss_layer_011": 0.016235, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.018433, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.019043, "value_mse_loss_layer_019": 0.022583, "value_mse_loss_layer_020": 0.024048, "value_mse_loss_layer_021": 0.02771, "value_mse_loss_layer_022": 0.028442, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.046387, "value_mse_loss_layer_026": 0.039062, "value_mse_loss_layer_027": 0.046631, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000136, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000238, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000242, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000557, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000679, "vq_loss_layer_029": 0.000916, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.003494 }, { "ce_loss": 2.297291, "epoch": 0.00838, "grad_norm": 0.002362398663535714, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.078613, "kv_mse_loss": 0.052188, "kv_vq_loss": 0.000481, "learning_rate": 0.000980811004657569, "loss": 0.052682, "step": 8380, "value_mse_loss_layer_000": 0.000565, "value_mse_loss_layer_001": 0.001732, "value_mse_loss_layer_002": 0.006134, "value_mse_loss_layer_003": 0.010376, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.008789, "value_mse_loss_layer_006": 0.010864, "value_mse_loss_layer_007": 0.011475, "value_mse_loss_layer_008": 0.014099, "value_mse_loss_layer_009": 0.018433, "value_mse_loss_layer_010": 0.015137, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.018066, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.018188, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.021484, "value_mse_loss_layer_019": 0.022827, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.027954, "value_mse_loss_layer_022": 0.028198, "value_mse_loss_layer_023": 0.030518, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.048584, "value_mse_loss_layer_028": 0.05249, "value_mse_loss_layer_029": 0.062012, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.00042, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.000362, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000214, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000607, "vq_loss_layer_028": 0.000961, "vq_loss_layer_029": 0.001472, "vq_loss_layer_030": 0.003387, "vq_loss_layer_031": 0.004517 }, { "ce_loss": 2.283115, "epoch": 0.00839, "grad_norm": 0.001986056799069047, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.052176, "kv_vq_loss": 0.00048, "learning_rate": 0.000980940490207175, "loss": 0.052686, "step": 8390, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001671, "value_mse_loss_layer_002": 0.006104, "value_mse_loss_layer_003": 0.01001, "value_mse_loss_layer_004": 0.009949, "value_mse_loss_layer_005": 0.009033, "value_mse_loss_layer_006": 0.011047, "value_mse_loss_layer_007": 0.011658, "value_mse_loss_layer_008": 0.014282, "value_mse_loss_layer_009": 0.018799, "value_mse_loss_layer_010": 0.015991, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.017212, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.021973, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.019653, "value_mse_loss_layer_019": 0.023193, "value_mse_loss_layer_020": 0.02356, "value_mse_loss_layer_021": 0.032959, "value_mse_loss_layer_022": 0.027588, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 7.6e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000195, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000218, "vq_loss_layer_011": 0.000263, "vq_loss_layer_012": 0.000381, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000389, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000236, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000507, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000338, "vq_loss_layer_026": 0.000486, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.000801, "vq_loss_layer_029": 0.000957, "vq_loss_layer_030": 0.002289, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.323974, "epoch": 0.0084, "grad_norm": 0.0019644061103463173, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.051807, "kv_vq_loss": 0.000466, "learning_rate": 0.0009810698215154702, "loss": 0.052301, "step": 8400, "value_mse_loss_layer_000": 0.00058, "value_mse_loss_layer_001": 0.00174, "value_mse_loss_layer_002": 0.005981, "value_mse_loss_layer_003": 0.009705, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.010681, "value_mse_loss_layer_007": 0.011841, "value_mse_loss_layer_008": 0.013916, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015503, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.018921, "value_mse_loss_layer_017": 0.022217, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.024658, "value_mse_loss_layer_021": 0.029419, "value_mse_loss_layer_022": 0.027832, "value_mse_loss_layer_023": 0.030884, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.041992, "value_mse_loss_layer_026": 0.038574, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.060303, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.061279, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000194, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000374, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000362, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000214, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000473, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.0019, "vq_loss_layer_031": 0.003494 }, { "ce_loss": 2.278402, "epoch": 0.00841, "grad_norm": 0.0020946289878338575, "key_mse_loss_layer_000": 0.002884, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.05242, "kv_vq_loss": 0.000482, "learning_rate": 0.000981198998949478, "loss": 0.052921, "step": 8410, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001663, "value_mse_loss_layer_002": 0.006134, "value_mse_loss_layer_003": 0.01001, "value_mse_loss_layer_004": 0.009338, "value_mse_loss_layer_005": 0.00946, "value_mse_loss_layer_006": 0.011536, "value_mse_loss_layer_007": 0.011841, "value_mse_loss_layer_008": 0.014587, "value_mse_loss_layer_009": 0.019043, "value_mse_loss_layer_010": 0.016968, "value_mse_loss_layer_011": 0.016235, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.018921, "value_mse_loss_layer_014": 0.019775, "value_mse_loss_layer_015": 0.022217, "value_mse_loss_layer_016": 0.017944, "value_mse_loss_layer_017": 0.022339, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.025757, "value_mse_loss_layer_020": 0.024414, "value_mse_loss_layer_021": 0.027222, "value_mse_loss_layer_022": 0.027588, "value_mse_loss_layer_023": 0.031006, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.044434, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.063477, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.061035, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000149, "vq_loss_layer_007": 0.00018, "vq_loss_layer_008": 0.000208, "vq_loss_layer_009": 0.000242, "vq_loss_layer_010": 0.000226, "vq_loss_layer_011": 0.000232, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.000425, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000387, "vq_loss_layer_017": 0.000408, "vq_loss_layer_018": 0.000211, "vq_loss_layer_019": 0.000203, "vq_loss_layer_020": 0.000231, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000288, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000462, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000782, "vq_loss_layer_029": 0.001144, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.003677 }, { "ce_loss": 2.292104, "epoch": 0.00842, "grad_norm": 0.0016293668886646628, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.05217, "kv_vq_loss": 0.000476, "learning_rate": 0.000981328022874912, "loss": 0.052673, "step": 8420, "value_mse_loss_layer_000": 0.000565, "value_mse_loss_layer_001": 0.001701, "value_mse_loss_layer_002": 0.006287, "value_mse_loss_layer_003": 0.010254, "value_mse_loss_layer_004": 0.009155, "value_mse_loss_layer_005": 0.009216, "value_mse_loss_layer_006": 0.010925, "value_mse_loss_layer_007": 0.011475, "value_mse_loss_layer_008": 0.014099, "value_mse_loss_layer_009": 0.018188, "value_mse_loss_layer_010": 0.015442, "value_mse_loss_layer_011": 0.016357, "value_mse_loss_layer_012": 0.017456, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.018799, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.0177, "value_mse_loss_layer_017": 0.021118, "value_mse_loss_layer_018": 0.020142, "value_mse_loss_layer_019": 0.023438, "value_mse_loss_layer_020": 0.024292, "value_mse_loss_layer_021": 0.027344, "value_mse_loss_layer_022": 0.028564, "value_mse_loss_layer_023": 0.031006, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.041748, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.061768, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.062256, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.00023, "vq_loss_layer_012": 0.000395, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000214, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000385, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.003448 }, { "ce_loss": 2.275825, "epoch": 0.00843, "grad_norm": 0.0019368483917787671, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.052103, "kv_vq_loss": 0.000454, "learning_rate": 0.0009814568936561854, "loss": 0.052594, "step": 8430, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001671, "value_mse_loss_layer_002": 0.006073, "value_mse_loss_layer_003": 0.009766, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008728, "value_mse_loss_layer_006": 0.010681, "value_mse_loss_layer_007": 0.012207, "value_mse_loss_layer_008": 0.013855, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015076, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.018921, "value_mse_loss_layer_014": 0.018799, "value_mse_loss_layer_015": 0.021362, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.021606, "value_mse_loss_layer_020": 0.023315, "value_mse_loss_layer_021": 0.027344, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.061768, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.061035, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000221, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.00024, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000381, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000385, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.00037, "vq_loss_layer_017": 0.000385, "vq_loss_layer_018": 0.000239, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.000372, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.00036, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.0009, "vq_loss_layer_029": 0.000954, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.00351 }, { "ce_loss": 2.278863, "epoch": 0.00844, "grad_norm": 0.0026813701260834932, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.052557, "kv_vq_loss": 0.000481, "learning_rate": 0.0009815856116564138, "loss": 0.053064, "step": 8440, "value_mse_loss_layer_000": 0.000572, "value_mse_loss_layer_001": 0.001724, "value_mse_loss_layer_002": 0.005981, "value_mse_loss_layer_003": 0.009766, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.009033, "value_mse_loss_layer_006": 0.010803, "value_mse_loss_layer_007": 0.011597, "value_mse_loss_layer_008": 0.01416, "value_mse_loss_layer_009": 0.018311, "value_mse_loss_layer_010": 0.015991, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.020264, "value_mse_loss_layer_015": 0.022217, "value_mse_loss_layer_016": 0.018433, "value_mse_loss_layer_017": 0.022461, "value_mse_loss_layer_018": 0.019043, "value_mse_loss_layer_019": 0.022705, "value_mse_loss_layer_020": 0.026245, "value_mse_loss_layer_021": 0.028198, "value_mse_loss_layer_022": 0.028442, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.041016, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.062988, "value_mse_loss_layer_028": 0.051514, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.064941, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000195, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000198, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000435, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000368, "vq_loss_layer_017": 0.00036, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000277, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000954, "vq_loss_layer_028": 0.000614, "vq_loss_layer_029": 0.000973, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.003433 }, { "ce_loss": 2.311099, "epoch": 0.00845, "grad_norm": 0.002002533758059144, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.051984, "kv_vq_loss": 0.000467, "learning_rate": 0.000981714177237423, "loss": 0.052478, "step": 8450, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001686, "value_mse_loss_layer_002": 0.00592, "value_mse_loss_layer_003": 0.009949, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.010986, "value_mse_loss_layer_007": 0.011475, "value_mse_loss_layer_008": 0.013916, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.015259, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.022095, "value_mse_loss_layer_016": 0.018066, "value_mse_loss_layer_017": 0.021851, "value_mse_loss_layer_018": 0.019287, "value_mse_loss_layer_019": 0.023682, "value_mse_loss_layer_020": 0.024048, "value_mse_loss_layer_021": 0.026733, "value_mse_loss_layer_022": 0.026978, "value_mse_loss_layer_023": 0.03064, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.061035, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000172, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000239, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.00022, "vq_loss_layer_012": 0.000376, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000226, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000809, "vq_loss_layer_030": 0.001755, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.325413, "epoch": 0.00846, "grad_norm": 0.002693688264116645, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.081543, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.086426, "key_mse_loss_layer_023": 0.083984, "key_mse_loss_layer_024": 0.066406, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.07373, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.051932, "kv_vq_loss": 0.000479, "learning_rate": 0.0009818425907597557, "loss": 0.052432, "step": 8460, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001678, "value_mse_loss_layer_002": 0.006409, "value_mse_loss_layer_003": 0.009827, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.008728, "value_mse_loss_layer_006": 0.01062, "value_mse_loss_layer_007": 0.011169, "value_mse_loss_layer_008": 0.013977, "value_mse_loss_layer_009": 0.018311, "value_mse_loss_layer_010": 0.01532, "value_mse_loss_layer_011": 0.016357, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.018677, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.019409, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.022949, "value_mse_loss_layer_021": 0.027344, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.041748, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.062012, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.060303, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000179, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000194, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000347, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000309, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000851, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.00354 }, { "ce_loss": 2.3018, "epoch": 0.00847, "grad_norm": 0.002161189913749695, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.051996, "kv_vq_loss": 0.000485, "learning_rate": 0.0009819708525826765, "loss": 0.052499, "step": 8470, "value_mse_loss_layer_000": 0.000572, "value_mse_loss_layer_001": 0.001686, "value_mse_loss_layer_002": 0.006134, "value_mse_loss_layer_003": 0.010986, "value_mse_loss_layer_004": 0.009338, "value_mse_loss_layer_005": 0.009216, "value_mse_loss_layer_006": 0.010742, "value_mse_loss_layer_007": 0.011719, "value_mse_loss_layer_008": 0.013733, "value_mse_loss_layer_009": 0.019653, "value_mse_loss_layer_010": 0.015564, "value_mse_loss_layer_011": 0.015869, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.019043, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.061768, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000194, "vq_loss_layer_008": 0.000211, "vq_loss_layer_009": 0.000322, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000257, "vq_loss_layer_012": 0.000462, "vq_loss_layer_013": 0.000351, "vq_loss_layer_014": 0.000422, "vq_loss_layer_015": 0.000553, "vq_loss_layer_016": 0.000391, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000374, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000324, "vq_loss_layer_024": 0.000332, "vq_loss_layer_025": 0.000456, "vq_loss_layer_026": 0.000629, "vq_loss_layer_027": 0.000679, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.003265, "vq_loss_layer_031": 0.004517 }, { "ce_loss": 2.262986, "epoch": 0.00848, "grad_norm": 0.002058750484138727, "key_mse_loss_layer_000": 0.003693, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.051971, "kv_vq_loss": 0.00047, "learning_rate": 0.0009820989630641784, "loss": 0.052463, "step": 8480, "value_mse_loss_layer_000": 0.00058, "value_mse_loss_layer_001": 0.00174, "value_mse_loss_layer_002": 0.005951, "value_mse_loss_layer_003": 0.009949, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.00885, "value_mse_loss_layer_006": 0.010681, "value_mse_loss_layer_007": 0.01123, "value_mse_loss_layer_008": 0.013977, "value_mse_loss_layer_009": 0.018188, "value_mse_loss_layer_010": 0.014771, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018799, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.017456, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.018799, "value_mse_loss_layer_019": 0.022827, "value_mse_loss_layer_020": 0.023682, "value_mse_loss_layer_021": 0.03125, "value_mse_loss_layer_022": 0.027466, "value_mse_loss_layer_023": 0.030762, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.041016, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.050049, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.070312, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000189, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000391, "vq_loss_layer_017": 0.000462, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000427, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000236, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000637, "vq_loss_layer_028": 0.000832, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.002441, "vq_loss_layer_031": 0.003494 }, { "ce_loss": 2.327522, "epoch": 0.00849, "grad_norm": 0.002108231419697404, "key_mse_loss_layer_000": 0.002777, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.061279, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.05191, "kv_vq_loss": 0.000468, "learning_rate": 0.0009822269225609882, "loss": 0.052405, "step": 8490, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001686, "value_mse_loss_layer_002": 0.006042, "value_mse_loss_layer_003": 0.009399, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.01062, "value_mse_loss_layer_007": 0.01123, "value_mse_loss_layer_008": 0.014099, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.017334, "value_mse_loss_layer_011": 0.016968, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.019531, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.0177, "value_mse_loss_layer_017": 0.022217, "value_mse_loss_layer_018": 0.019531, "value_mse_loss_layer_019": 0.022705, "value_mse_loss_layer_020": 0.025757, "value_mse_loss_layer_021": 0.028442, "value_mse_loss_layer_022": 0.02832, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.060547, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.061035, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.00018, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.000214, "vq_loss_layer_011": 0.000226, "vq_loss_layer_012": 0.000378, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000359, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000187, "vq_loss_layer_020": 0.000233, "vq_loss_layer_021": 0.000366, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.345503, "epoch": 0.0085, "grad_norm": 0.0019841778557747602, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.05199, "kv_vq_loss": 0.000473, "learning_rate": 0.000982354731428573, "loss": 0.052481, "step": 8500, "value_mse_loss_layer_000": 0.000565, "value_mse_loss_layer_001": 0.001686, "value_mse_loss_layer_002": 0.00592, "value_mse_loss_layer_003": 0.009766, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.008911, "value_mse_loss_layer_006": 0.010864, "value_mse_loss_layer_007": 0.011902, "value_mse_loss_layer_008": 0.014038, "value_mse_loss_layer_009": 0.019531, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.017212, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.022949, "value_mse_loss_layer_016": 0.018555, "value_mse_loss_layer_017": 0.023071, "value_mse_loss_layer_018": 0.019409, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.023804, "value_mse_loss_layer_021": 0.027588, "value_mse_loss_layer_022": 0.027832, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.063477, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.0625, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000201, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000284, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000223, "vq_loss_layer_012": 0.000383, "vq_loss_layer_013": 0.000341, "vq_loss_layer_014": 0.000414, "vq_loss_layer_015": 0.000439, "vq_loss_layer_016": 0.000408, "vq_loss_layer_017": 0.000439, "vq_loss_layer_018": 0.000259, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000228, "vq_loss_layer_021": 0.000362, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000938, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.003662 }, { "ce_loss": 2.279049, "epoch": 0.00851, "grad_norm": 0.002466922625899315, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.051767, "kv_vq_loss": 0.000462, "learning_rate": 0.000982482390021147, "loss": 0.052261, "step": 8510, "value_mse_loss_layer_000": 0.000572, "value_mse_loss_layer_001": 0.001694, "value_mse_loss_layer_002": 0.006165, "value_mse_loss_layer_003": 0.009949, "value_mse_loss_layer_004": 0.009399, "value_mse_loss_layer_005": 0.008972, "value_mse_loss_layer_006": 0.010742, "value_mse_loss_layer_007": 0.011414, "value_mse_loss_layer_008": 0.014038, "value_mse_loss_layer_009": 0.018188, "value_mse_loss_layer_010": 0.014832, "value_mse_loss_layer_011": 0.015869, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.019897, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.019409, "value_mse_loss_layer_019": 0.023926, "value_mse_loss_layer_020": 0.023315, "value_mse_loss_layer_021": 0.027222, "value_mse_loss_layer_022": 0.028076, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.065918, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.064941, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000185, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000387, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.000412, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000235, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000395, "vq_loss_layer_027": 0.000469, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.00174, "vq_loss_layer_031": 0.004059 }, { "ce_loss": 2.294239, "epoch": 0.00852, "grad_norm": 0.002513549290597439, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.052167, "kv_vq_loss": 0.000473, "learning_rate": 0.0009826098986916749, "loss": 0.05267, "step": 8520, "value_mse_loss_layer_000": 0.000572, "value_mse_loss_layer_001": 0.001709, "value_mse_loss_layer_002": 0.007294, "value_mse_loss_layer_003": 0.009888, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.008972, "value_mse_loss_layer_006": 0.010681, "value_mse_loss_layer_007": 0.01123, "value_mse_loss_layer_008": 0.01416, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.015076, "value_mse_loss_layer_011": 0.016357, "value_mse_loss_layer_012": 0.016602, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.018799, "value_mse_loss_layer_015": 0.021606, "value_mse_loss_layer_016": 0.017944, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.022705, "value_mse_loss_layer_020": 0.025879, "value_mse_loss_layer_021": 0.029785, "value_mse_loss_layer_022": 0.027466, "value_mse_loss_layer_023": 0.030518, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.061523, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000182, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.000236, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000226, "vq_loss_layer_012": 0.000364, "vq_loss_layer_013": 0.000383, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.00037, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000401, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000275, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000957, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.271473, "epoch": 0.00853, "grad_norm": 0.001646032091230154, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.051852, "kv_vq_loss": 0.000451, "learning_rate": 0.0009827372577918807, "loss": 0.052347, "step": 8530, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.001709, "value_mse_loss_layer_002": 0.006165, "value_mse_loss_layer_003": 0.009888, "value_mse_loss_layer_004": 0.009521, "value_mse_loss_layer_005": 0.009155, "value_mse_loss_layer_006": 0.010559, "value_mse_loss_layer_007": 0.011414, "value_mse_loss_layer_008": 0.014099, "value_mse_loss_layer_009": 0.018188, "value_mse_loss_layer_010": 0.015015, "value_mse_loss_layer_011": 0.015869, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.017578, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.019409, "value_mse_loss_layer_019": 0.022583, "value_mse_loss_layer_020": 0.024658, "value_mse_loss_layer_021": 0.027832, "value_mse_loss_layer_022": 0.027832, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.052979, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000192, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.00041, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.00036, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000209, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000238, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.003448 }, { "ce_loss": 2.282883, "epoch": 0.00854, "grad_norm": 0.002087069908156991, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.051782, "kv_vq_loss": 0.000483, "learning_rate": 0.0009828644676722511, "loss": 0.05228, "step": 8540, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001671, "value_mse_loss_layer_002": 0.005859, "value_mse_loss_layer_003": 0.011536, "value_mse_loss_layer_004": 0.008667, "value_mse_loss_layer_005": 0.008423, "value_mse_loss_layer_006": 0.010559, "value_mse_loss_layer_007": 0.01123, "value_mse_loss_layer_008": 0.013672, "value_mse_loss_layer_009": 0.018799, "value_mse_loss_layer_010": 0.015198, "value_mse_loss_layer_011": 0.016357, "value_mse_loss_layer_012": 0.016602, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.017456, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.023193, "value_mse_loss_layer_021": 0.027954, "value_mse_loss_layer_022": 0.027588, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.041016, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.062256, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.060791, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000225, "vq_loss_layer_012": 0.000362, "vq_loss_layer_013": 0.000341, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000414, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.000326, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000372, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.001068, "vq_loss_layer_029": 0.001572, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.004913 }, { "ce_loss": 2.293238, "epoch": 0.00855, "grad_norm": 0.002236641477793455, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.051865, "kv_vq_loss": 0.000469, "learning_rate": 0.0009829915286820431, "loss": 0.052356, "step": 8550, "value_mse_loss_layer_000": 0.000587, "value_mse_loss_layer_001": 0.001724, "value_mse_loss_layer_002": 0.006104, "value_mse_loss_layer_003": 0.010742, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.008911, "value_mse_loss_layer_006": 0.01062, "value_mse_loss_layer_007": 0.011414, "value_mse_loss_layer_008": 0.013916, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.015625, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.019043, "value_mse_loss_layer_019": 0.021851, "value_mse_loss_layer_020": 0.023315, "value_mse_loss_layer_021": 0.026855, "value_mse_loss_layer_022": 0.027222, "value_mse_loss_layer_023": 0.030884, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.069824, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.061523, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000172, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000425, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000412, "vq_loss_layer_015": 0.000427, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.001656, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.296951, "epoch": 0.00856, "grad_norm": 0.002356188138946891, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.052139, "kv_vq_loss": 0.000469, "learning_rate": 0.0009831184411692882, "loss": 0.052637, "step": 8560, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001686, "value_mse_loss_layer_002": 0.006744, "value_mse_loss_layer_003": 0.010376, "value_mse_loss_layer_004": 0.009583, "value_mse_loss_layer_005": 0.009094, "value_mse_loss_layer_006": 0.010742, "value_mse_loss_layer_007": 0.011658, "value_mse_loss_layer_008": 0.014099, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.015137, "value_mse_loss_layer_011": 0.015991, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.018555, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.018188, "value_mse_loss_layer_019": 0.021484, "value_mse_loss_layer_020": 0.02356, "value_mse_loss_layer_021": 0.026123, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.036865, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.064941, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000192, "vq_loss_layer_009": 0.000242, "vq_loss_layer_010": 0.000202, "vq_loss_layer_011": 0.000228, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.000362, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000322, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000851, "vq_loss_layer_029": 0.001099, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.33299, "epoch": 0.00857, "grad_norm": 0.0019372300012037158, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.05181, "kv_vq_loss": 0.000471, "learning_rate": 0.0009832452054807995, "loss": 0.05231, "step": 8570, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001701, "value_mse_loss_layer_002": 0.006042, "value_mse_loss_layer_003": 0.009705, "value_mse_loss_layer_004": 0.009155, "value_mse_loss_layer_005": 0.008972, "value_mse_loss_layer_006": 0.010742, "value_mse_loss_layer_007": 0.011902, "value_mse_loss_layer_008": 0.01416, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.015625, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.018311, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.021973, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.021118, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.022705, "value_mse_loss_layer_021": 0.026367, "value_mse_loss_layer_022": 0.026978, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.061035, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.061523, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 7e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.00012, "vq_loss_layer_007": 0.00021, "vq_loss_layer_008": 0.000192, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.00041, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000359, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000465, "vq_loss_layer_028": 0.000713, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.001633, "vq_loss_layer_031": 0.003647 }, { "ce_loss": 2.271028, "epoch": 0.00858, "grad_norm": 0.001586254802532494, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.052084, "kv_vq_loss": 0.000477, "learning_rate": 0.0009833718219621763, "loss": 0.052585, "step": 8580, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.001701, "value_mse_loss_layer_002": 0.005951, "value_mse_loss_layer_003": 0.010132, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.009033, "value_mse_loss_layer_006": 0.010925, "value_mse_loss_layer_007": 0.011536, "value_mse_loss_layer_008": 0.013977, "value_mse_loss_layer_009": 0.018921, "value_mse_loss_layer_010": 0.015564, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.022095, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.022217, "value_mse_loss_layer_018": 0.018188, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.02356, "value_mse_loss_layer_021": 0.02771, "value_mse_loss_layer_022": 0.026978, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.060547, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.060547, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000179, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000422, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.000401, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000372, "vq_loss_layer_017": 0.000349, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000401, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000242, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.00082, "vq_loss_layer_030": 0.00164, "vq_loss_layer_031": 0.00351 }, { "ce_loss": 2.285072, "epoch": 0.00859, "grad_norm": 0.002234033774584532, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.05199, "kv_vq_loss": 0.000464, "learning_rate": 0.0009834982909578103, "loss": 0.052481, "step": 8590, "value_mse_loss_layer_000": 0.000572, "value_mse_loss_layer_001": 0.001717, "value_mse_loss_layer_002": 0.006317, "value_mse_loss_layer_003": 0.010315, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.008667, "value_mse_loss_layer_006": 0.010498, "value_mse_loss_layer_007": 0.011108, "value_mse_loss_layer_008": 0.013794, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016602, "value_mse_loss_layer_013": 0.018433, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.022583, "value_mse_loss_layer_016": 0.017944, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.019165, "value_mse_loss_layer_019": 0.021973, "value_mse_loss_layer_020": 0.023438, "value_mse_loss_layer_021": 0.028442, "value_mse_loss_layer_022": 0.027954, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.0625, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000372, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.00036, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000294, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000813, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003632 }, { "ce_loss": 2.27177, "epoch": 0.0086, "grad_norm": 0.001802843064069748, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.051892, "kv_vq_loss": 0.000489, "learning_rate": 0.0009836246128108919, "loss": 0.052396, "step": 8600, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001678, "value_mse_loss_layer_002": 0.00592, "value_mse_loss_layer_003": 0.01001, "value_mse_loss_layer_004": 0.009399, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.010864, "value_mse_loss_layer_007": 0.011658, "value_mse_loss_layer_008": 0.014343, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.01532, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.022095, "value_mse_loss_layer_016": 0.017944, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.018921, "value_mse_loss_layer_019": 0.022583, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.026978, "value_mse_loss_layer_022": 0.027954, "value_mse_loss_layer_023": 0.032227, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.062012, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000133, "vq_loss_layer_007": 0.000195, "vq_loss_layer_008": 0.000195, "vq_loss_layer_009": 0.000238, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000228, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000452, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.000904, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.003281 }, { "ce_loss": 2.293204, "epoch": 0.00861, "grad_norm": 0.002269292948767543, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.051553, "kv_vq_loss": 0.000444, "learning_rate": 0.0009837507878634136, "loss": 0.052045, "step": 8610, "value_mse_loss_layer_000": 0.000572, "value_mse_loss_layer_001": 0.001709, "value_mse_loss_layer_002": 0.006012, "value_mse_loss_layer_003": 0.009766, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.009521, "value_mse_loss_layer_006": 0.010376, "value_mse_loss_layer_007": 0.011169, "value_mse_loss_layer_008": 0.01355, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.014709, "value_mse_loss_layer_011": 0.015442, "value_mse_loss_layer_012": 0.016235, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.019043, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.032471, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.059326, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 9e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000366, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000444, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000226, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.332943, "epoch": 0.00862, "grad_norm": 0.002074260264635086, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.051791, "kv_vq_loss": 0.000459, "learning_rate": 0.0009838768164561781, "loss": 0.052283, "step": 8620, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001686, "value_mse_loss_layer_002": 0.006256, "value_mse_loss_layer_003": 0.01001, "value_mse_loss_layer_004": 0.009766, "value_mse_loss_layer_005": 0.009277, "value_mse_loss_layer_006": 0.01123, "value_mse_loss_layer_007": 0.011841, "value_mse_loss_layer_008": 0.014038, "value_mse_loss_layer_009": 0.018311, "value_mse_loss_layer_010": 0.014893, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.017944, "value_mse_loss_layer_014": 0.019043, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.022339, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.021606, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.027222, "value_mse_loss_layer_022": 0.028076, "value_mse_loss_layer_023": 0.030518, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.053955, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.061035, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 7e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000138, "vq_loss_layer_007": 0.000198, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000236, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000224, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.00042, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000359, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000243, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000553, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.003769 }, { "ce_loss": 2.30746, "epoch": 0.00863, "grad_norm": 0.002877625869587064, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.051736, "kv_vq_loss": 0.000467, "learning_rate": 0.000984002698928802, "loss": 0.05224, "step": 8630, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.001709, "value_mse_loss_layer_002": 0.006317, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.01062, "value_mse_loss_layer_007": 0.011108, "value_mse_loss_layer_008": 0.013855, "value_mse_loss_layer_009": 0.018311, "value_mse_loss_layer_010": 0.01532, "value_mse_loss_layer_011": 0.015991, "value_mse_loss_layer_012": 0.016602, "value_mse_loss_layer_013": 0.018311, "value_mse_loss_layer_014": 0.018677, "value_mse_loss_layer_015": 0.021851, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.020264, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.025513, "value_mse_loss_layer_021": 0.026855, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.03064, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.071289, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.062256, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000171, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000228, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000404, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000923, "vq_loss_layer_030": 0.001831, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.263158, "epoch": 0.00864, "grad_norm": 0.001902902266010642, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.051797, "kv_vq_loss": 0.000467, "learning_rate": 0.0009841284356197231, "loss": 0.052289, "step": 8640, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001633, "value_mse_loss_layer_002": 0.006409, "value_mse_loss_layer_003": 0.010803, "value_mse_loss_layer_004": 0.009338, "value_mse_loss_layer_005": 0.009155, "value_mse_loss_layer_006": 0.010986, "value_mse_loss_layer_007": 0.011658, "value_mse_loss_layer_008": 0.014038, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015198, "value_mse_loss_layer_011": 0.016724, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.021973, "value_mse_loss_layer_016": 0.018677, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.018433, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.022949, "value_mse_loss_layer_021": 0.026489, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.041016, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.063477, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.062012, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 8.9e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.000209, "vq_loss_layer_009": 0.000271, "vq_loss_layer_010": 0.000224, "vq_loss_layer_011": 0.000271, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000446, "vq_loss_layer_015": 0.000486, "vq_loss_layer_016": 0.000448, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000235, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000374, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.000307, "vq_loss_layer_024": 0.000334, "vq_loss_layer_025": 0.000462, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000641, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.26614, "epoch": 0.00865, "grad_norm": 0.0020466505084186792, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.052069, "kv_vq_loss": 0.000468, "learning_rate": 0.0009842540268662034, "loss": 0.05256, "step": 8650, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001678, "value_mse_loss_layer_002": 0.00592, "value_mse_loss_layer_003": 0.010071, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.009033, "value_mse_loss_layer_006": 0.010437, "value_mse_loss_layer_007": 0.011414, "value_mse_loss_layer_008": 0.014038, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.014954, "value_mse_loss_layer_011": 0.015869, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.018311, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.021973, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.026855, "value_mse_loss_layer_022": 0.029907, "value_mse_loss_layer_023": 0.03064, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.060547, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000469, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.003632 }, { "ce_loss": 2.34424, "epoch": 0.00866, "grad_norm": 0.002902192762121558, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.051465, "kv_vq_loss": 0.000453, "learning_rate": 0.0009843794730043365, "loss": 0.051959, "step": 8660, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001671, "value_mse_loss_layer_002": 0.005798, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.010437, "value_mse_loss_layer_007": 0.01123, "value_mse_loss_layer_008": 0.014038, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015381, "value_mse_loss_layer_011": 0.015503, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.023315, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.023315, "value_mse_loss_layer_021": 0.028076, "value_mse_loss_layer_022": 0.027588, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.041016, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000185, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000467, "vq_loss_layer_016": 0.000372, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.003326 }, { "ce_loss": 2.302853, "epoch": 0.00867, "grad_norm": 0.002140800002962351, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.080078, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.078613, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.095215, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.099609, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.100586, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.078125, "key_mse_loss_layer_026": 0.089355, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.096191, "key_mse_loss_layer_029": 0.09082, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.051895, "kv_vq_loss": 0.000458, "learning_rate": 0.0009845047743690526, "loss": 0.052396, "step": 8670, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001678, "value_mse_loss_layer_002": 0.006317, "value_mse_loss_layer_003": 0.010254, "value_mse_loss_layer_004": 0.009521, "value_mse_loss_layer_005": 0.009216, "value_mse_loss_layer_006": 0.011108, "value_mse_loss_layer_007": 0.012268, "value_mse_loss_layer_008": 0.014282, "value_mse_loss_layer_009": 0.019043, "value_mse_loss_layer_010": 0.015991, "value_mse_loss_layer_011": 0.017212, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.02002, "value_mse_loss_layer_015": 0.021851, "value_mse_loss_layer_016": 0.018921, "value_mse_loss_layer_017": 0.021851, "value_mse_loss_layer_018": 0.019165, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.024048, "value_mse_loss_layer_021": 0.027832, "value_mse_loss_layer_022": 0.02832, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.043701, "value_mse_loss_layer_026": 0.038574, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.055176, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.072266, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 2.4e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 8.8e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000203, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000254, "vq_loss_layer_011": 0.000292, "vq_loss_layer_012": 0.000397, "vq_loss_layer_013": 0.00036, "vq_loss_layer_014": 0.000444, "vq_loss_layer_015": 0.000553, "vq_loss_layer_016": 0.00045, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000263, "vq_loss_layer_019": 0.000267, "vq_loss_layer_020": 0.000299, "vq_loss_layer_021": 0.000437, "vq_loss_layer_022": 0.000395, "vq_loss_layer_023": 0.000471, "vq_loss_layer_024": 0.000465, "vq_loss_layer_025": 0.000648, "vq_loss_layer_026": 0.00079, "vq_loss_layer_027": 0.000774, "vq_loss_layer_028": 0.001556, "vq_loss_layer_029": 0.00164, "vq_loss_layer_030": 0.003082, "vq_loss_layer_031": 0.005646 }, { "ce_loss": 2.277143, "epoch": 0.00868, "grad_norm": 0.002031947486102581, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.05181, "kv_vq_loss": 0.000479, "learning_rate": 0.0009846299312941229, "loss": 0.05231, "step": 8680, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.00164, "value_mse_loss_layer_002": 0.006073, "value_mse_loss_layer_003": 0.009827, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.008911, "value_mse_loss_layer_006": 0.010925, "value_mse_loss_layer_007": 0.011353, "value_mse_loss_layer_008": 0.013855, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.015259, "value_mse_loss_layer_011": 0.015625, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.018433, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.0177, "value_mse_loss_layer_017": 0.021851, "value_mse_loss_layer_018": 0.019165, "value_mse_loss_layer_019": 0.025513, "value_mse_loss_layer_020": 0.023682, "value_mse_loss_layer_021": 0.02771, "value_mse_loss_layer_022": 0.029053, "value_mse_loss_layer_023": 0.030151, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.064941, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000133, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000357, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000309, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000465, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.00383 }, { "ce_loss": 2.303906, "epoch": 0.00869, "grad_norm": 0.0024585502687841654, "key_mse_loss_layer_000": 0.002655, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.052002, "key_mse_loss_layer_003": 0.043945, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.072754, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.052078, "kv_vq_loss": 0.000472, "learning_rate": 0.0009847549441121665, "loss": 0.052573, "step": 8690, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001648, "value_mse_loss_layer_002": 0.00592, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.009338, "value_mse_loss_layer_006": 0.010498, "value_mse_loss_layer_007": 0.011719, "value_mse_loss_layer_008": 0.014038, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.015564, "value_mse_loss_layer_011": 0.015991, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.018921, "value_mse_loss_layer_014": 0.021973, "value_mse_loss_layer_015": 0.022217, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.021606, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.026489, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.030518, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000187, "vq_loss_layer_008": 0.000194, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000207, "vq_loss_layer_011": 0.000221, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000454, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.000213, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000213, "vq_loss_layer_021": 0.000402, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000351, "vq_loss_layer_024": 0.000227, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.00174, "vq_loss_layer_031": 0.003571 }, { "ce_loss": 2.293986, "epoch": 0.0087, "grad_norm": 0.0018928679637610912, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.051895, "kv_vq_loss": 0.000455, "learning_rate": 0.0009848798131546546, "loss": 0.052386, "step": 8700, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001663, "value_mse_loss_layer_002": 0.006012, "value_mse_loss_layer_003": 0.010376, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.008789, "value_mse_loss_layer_006": 0.010803, "value_mse_loss_layer_007": 0.01123, "value_mse_loss_layer_008": 0.013733, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.014954, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.017944, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.023438, "value_mse_loss_layer_021": 0.027588, "value_mse_loss_layer_022": 0.026611, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.061035, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.061279, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000133, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000198, "vq_loss_layer_011": 0.00023, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000391, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000383, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.003448 }, { "ce_loss": 2.282169, "epoch": 0.00871, "grad_norm": 0.0017923793056979775, "key_mse_loss_layer_000": 0.00267, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.045166, "key_mse_loss_layer_004": 0.043945, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.075684, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.052258, "kv_vq_loss": 0.000463, "learning_rate": 0.0009850045387519157, "loss": 0.052756, "step": 8710, "value_mse_loss_layer_000": 0.000534, "value_mse_loss_layer_001": 0.001617, "value_mse_loss_layer_002": 0.005859, "value_mse_loss_layer_003": 0.010315, "value_mse_loss_layer_004": 0.009399, "value_mse_loss_layer_005": 0.008667, "value_mse_loss_layer_006": 0.010803, "value_mse_loss_layer_007": 0.011719, "value_mse_loss_layer_008": 0.01355, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.01532, "value_mse_loss_layer_011": 0.015259, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.018311, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.022339, "value_mse_loss_layer_018": 0.020264, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.02356, "value_mse_loss_layer_021": 0.026367, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.04248, "value_mse_loss_layer_026": 0.036377, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.05127, "value_mse_loss_layer_029": 0.061523, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.062012, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000176, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000241, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000362, "vq_loss_layer_013": 0.000374, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000368, "vq_loss_layer_018": 0.000243, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.00032, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000427, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.003815 }, { "ce_loss": 2.307352, "epoch": 0.00872, "grad_norm": 0.00230441614985466, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.052298, "kv_vq_loss": 0.000453, "learning_rate": 0.0009851291212331415, "loss": 0.052789, "step": 8720, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001678, "value_mse_loss_layer_002": 0.006989, "value_mse_loss_layer_003": 0.009705, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.008728, "value_mse_loss_layer_006": 0.01062, "value_mse_loss_layer_007": 0.011353, "value_mse_loss_layer_008": 0.013855, "value_mse_loss_layer_009": 0.018433, "value_mse_loss_layer_010": 0.015503, "value_mse_loss_layer_011": 0.016479, "value_mse_loss_layer_012": 0.017212, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019043, "value_mse_loss_layer_015": 0.021606, "value_mse_loss_layer_016": 0.018677, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.024902, "value_mse_loss_layer_021": 0.029541, "value_mse_loss_layer_022": 0.028931, "value_mse_loss_layer_023": 0.040283, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.042969, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.059814, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000229, "vq_loss_layer_012": 0.000387, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.00037, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.00036, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.00045, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000425, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.001038, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003616 }, { "ce_loss": 2.297673, "epoch": 0.00873, "grad_norm": 0.002416844479739666, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.052258, "kv_vq_loss": 0.000459, "learning_rate": 0.0009852535609263923, "loss": 0.052747, "step": 8730, "value_mse_loss_layer_000": 0.00058, "value_mse_loss_layer_001": 0.001678, "value_mse_loss_layer_002": 0.006531, "value_mse_loss_layer_003": 0.011292, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.009216, "value_mse_loss_layer_006": 0.011169, "value_mse_loss_layer_007": 0.011108, "value_mse_loss_layer_008": 0.013733, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.014526, "value_mse_loss_layer_011": 0.015625, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.02124, "value_mse_loss_layer_016": 0.016968, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.023193, "value_mse_loss_layer_021": 0.027344, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.030884, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.066895, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.0625, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000142, "vq_loss_layer_007": 0.000171, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000231, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000425, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.00079, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.001869, "vq_loss_layer_031": 0.003433 }, { "ce_loss": 2.258158, "epoch": 0.00874, "grad_norm": 0.0017915427451953292, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.052066, "kv_vq_loss": 0.000469, "learning_rate": 0.0009853778581586006, "loss": 0.052563, "step": 8740, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.00164, "value_mse_loss_layer_002": 0.005829, "value_mse_loss_layer_003": 0.00946, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.010254, "value_mse_loss_layer_007": 0.011475, "value_mse_loss_layer_008": 0.013794, "value_mse_loss_layer_009": 0.018433, "value_mse_loss_layer_010": 0.014832, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.0177, "value_mse_loss_layer_017": 0.022339, "value_mse_loss_layer_018": 0.019409, "value_mse_loss_layer_019": 0.022827, "value_mse_loss_layer_020": 0.023682, "value_mse_loss_layer_021": 0.026733, "value_mse_loss_layer_022": 0.028564, "value_mse_loss_layer_023": 0.030762, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.061523, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.059082, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000182, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000393, "vq_loss_layer_015": 0.000441, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000357, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000209, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000444, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.003265 }, { "ce_loss": 2.257451, "epoch": 0.00875, "grad_norm": 0.001979268854483962, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.052158, "kv_vq_loss": 0.000467, "learning_rate": 0.0009855020132555784, "loss": 0.052661, "step": 8750, "value_mse_loss_layer_000": 0.000565, "value_mse_loss_layer_001": 0.001671, "value_mse_loss_layer_002": 0.006042, "value_mse_loss_layer_003": 0.009583, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.008667, "value_mse_loss_layer_006": 0.010254, "value_mse_loss_layer_007": 0.01123, "value_mse_loss_layer_008": 0.013733, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014954, "value_mse_loss_layer_011": 0.015503, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.016968, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.022827, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.026001, "value_mse_loss_layer_022": 0.026978, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000185, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000425, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.001144, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.35218, "epoch": 0.00876, "grad_norm": 0.002523822709918022, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.051758, "kv_vq_loss": 0.000461, "learning_rate": 0.0009856260265420202, "loss": 0.052252, "step": 8760, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001671, "value_mse_loss_layer_002": 0.005951, "value_mse_loss_layer_003": 0.009583, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.008667, "value_mse_loss_layer_006": 0.011108, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013611, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014832, "value_mse_loss_layer_011": 0.015991, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.017944, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.02124, "value_mse_loss_layer_016": 0.017944, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.022705, "value_mse_loss_layer_021": 0.028442, "value_mse_loss_layer_022": 0.028076, "value_mse_loss_layer_023": 0.032715, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.041992, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.05127, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.060791, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000139, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.000221, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000372, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000238, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000475, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.003387 }, { "ce_loss": 2.305831, "epoch": 0.00877, "grad_norm": 0.002290950622409582, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.051804, "kv_vq_loss": 0.000462, "learning_rate": 0.0009857498983415099, "loss": 0.052301, "step": 8770, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.001671, "value_mse_loss_layer_002": 0.006104, "value_mse_loss_layer_003": 0.010315, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008911, "value_mse_loss_layer_006": 0.01062, "value_mse_loss_layer_007": 0.011597, "value_mse_loss_layer_008": 0.013855, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.015015, "value_mse_loss_layer_011": 0.015991, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.018433, "value_mse_loss_layer_014": 0.019531, "value_mse_loss_layer_015": 0.022095, "value_mse_loss_layer_016": 0.0177, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.023071, "value_mse_loss_layer_021": 0.026245, "value_mse_loss_layer_022": 0.027954, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000429, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000241, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.00351 }, { "ce_loss": 2.313964, "epoch": 0.00878, "grad_norm": 0.0022058733738958836, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.051498, "kv_vq_loss": 0.000452, "learning_rate": 0.0009858736289765253, "loss": 0.05199, "step": 8780, "value_mse_loss_layer_000": 0.000576, "value_mse_loss_layer_001": 0.001663, "value_mse_loss_layer_002": 0.005951, "value_mse_loss_layer_003": 0.009705, "value_mse_loss_layer_004": 0.008667, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.010498, "value_mse_loss_layer_007": 0.011292, "value_mse_loss_layer_008": 0.014038, "value_mse_loss_layer_009": 0.018311, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.019043, "value_mse_loss_layer_015": 0.022339, "value_mse_loss_layer_016": 0.018433, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.019287, "value_mse_loss_layer_019": 0.021973, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.026489, "value_mse_loss_layer_022": 0.029541, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.05957, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000188, "vq_loss_layer_008": 0.000182, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000201, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000389, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000374, "vq_loss_layer_017": 0.000355, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000328, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.344188, "epoch": 0.00879, "grad_norm": 0.0019148417050018907, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.076172, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.051547, "kv_vq_loss": 0.000456, "learning_rate": 0.0009859972187684428, "loss": 0.052042, "step": 8790, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001656, "value_mse_loss_layer_002": 0.00592, "value_mse_loss_layer_003": 0.009766, "value_mse_loss_layer_004": 0.009521, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.010437, "value_mse_loss_layer_007": 0.011169, "value_mse_loss_layer_008": 0.014099, "value_mse_loss_layer_009": 0.018433, "value_mse_loss_layer_010": 0.014893, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.017212, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.021973, "value_mse_loss_layer_016": 0.018066, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.022095, "value_mse_loss_layer_020": 0.023804, "value_mse_loss_layer_021": 0.026123, "value_mse_loss_layer_022": 0.027954, "value_mse_loss_layer_023": 0.032227, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000171, "vq_loss_layer_008": 0.000199, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.00025, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000385, "vq_loss_layer_017": 0.000349, "vq_loss_layer_018": 0.000269, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.00034, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003738 }, { "ce_loss": 2.334886, "epoch": 0.0088, "grad_norm": 0.0021579416934400797, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.051776, "kv_vq_loss": 0.000466, "learning_rate": 0.000986120668037542, "loss": 0.052267, "step": 8800, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.001701, "value_mse_loss_layer_002": 0.0065, "value_mse_loss_layer_003": 0.011841, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.009033, "value_mse_loss_layer_006": 0.010864, "value_mse_loss_layer_007": 0.01178, "value_mse_loss_layer_008": 0.013916, "value_mse_loss_layer_009": 0.018921, "value_mse_loss_layer_010": 0.015442, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.019531, "value_mse_loss_layer_015": 0.021973, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.021973, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.021362, "value_mse_loss_layer_020": 0.027344, "value_mse_loss_layer_021": 0.026855, "value_mse_loss_layer_022": 0.026611, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.061768, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000241, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000402, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000357, "vq_loss_layer_017": 0.000378, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000252, "vq_loss_layer_021": 0.00042, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000319, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.00386 }, { "ce_loss": 2.319561, "epoch": 0.00881, "grad_norm": 0.003900415264070034, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.054932, "key_mse_loss_layer_004": 0.063477, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.051617, "kv_vq_loss": 0.000464, "learning_rate": 0.000986243977103012, "loss": 0.052112, "step": 8810, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001686, "value_mse_loss_layer_002": 0.006073, "value_mse_loss_layer_003": 0.009644, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.008179, "value_mse_loss_layer_006": 0.010376, "value_mse_loss_layer_007": 0.010742, "value_mse_loss_layer_008": 0.013611, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014832, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.024658, "value_mse_loss_layer_019": 0.022095, "value_mse_loss_layer_020": 0.023193, "value_mse_loss_layer_021": 0.028931, "value_mse_loss_layer_022": 0.027832, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.093262, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.060791, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000462, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000298, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.00024, "vq_loss_layer_023": 0.000219, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.001297, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.003128 }, { "ce_loss": 2.296824, "epoch": 0.00882, "grad_norm": 0.002400466240942478, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.051944, "kv_vq_loss": 0.000458, "learning_rate": 0.0009863671462829549, "loss": 0.052432, "step": 8820, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001633, "value_mse_loss_layer_002": 0.00592, "value_mse_loss_layer_003": 0.009583, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.010742, "value_mse_loss_layer_007": 0.011353, "value_mse_loss_layer_008": 0.013489, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.015442, "value_mse_loss_layer_011": 0.015625, "value_mse_loss_layer_012": 0.016235, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.018799, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.019531, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.025635, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.063477, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.05835, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000137, "vq_loss_layer_007": 0.000185, "vq_loss_layer_008": 0.000182, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000206, "vq_loss_layer_011": 0.000223, "vq_loss_layer_012": 0.00038, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.00041, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.00023, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000298, "vq_loss_layer_023": 0.000303, "vq_loss_layer_024": 0.000311, "vq_loss_layer_025": 0.000355, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.000957, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.003464 }, { "ce_loss": 2.32463, "epoch": 0.00883, "grad_norm": 0.0019641234539449215, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.051721, "kv_vq_loss": 0.000451, "learning_rate": 0.000986490175894392, "loss": 0.052213, "step": 8830, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001663, "value_mse_loss_layer_002": 0.005951, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.008789, "value_mse_loss_layer_006": 0.01062, "value_mse_loss_layer_007": 0.011353, "value_mse_loss_layer_008": 0.013733, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.015015, "value_mse_loss_layer_011": 0.015991, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.021973, "value_mse_loss_layer_016": 0.017456, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.018799, "value_mse_loss_layer_019": 0.022095, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.026733, "value_mse_loss_layer_022": 0.027222, "value_mse_loss_layer_023": 0.029419, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.041992, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.064941, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.062012, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.000187, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000202, "vq_loss_layer_011": 0.000236, "vq_loss_layer_012": 0.000378, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000399, "vq_loss_layer_015": 0.000479, "vq_loss_layer_016": 0.000393, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.000206, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.001076, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.003937 }, { "ce_loss": 2.324718, "epoch": 0.00884, "grad_norm": 0.0018242686055600643, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.077637, "kv_mse_loss": 0.051547, "kv_vq_loss": 0.00046, "learning_rate": 0.0009866130662532682, "loss": 0.052045, "step": 8840, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001686, "value_mse_loss_layer_002": 0.006134, "value_mse_loss_layer_003": 0.010193, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.010742, "value_mse_loss_layer_007": 0.011414, "value_mse_loss_layer_008": 0.014221, "value_mse_loss_layer_009": 0.018188, "value_mse_loss_layer_010": 0.015625, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.018311, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.018677, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.019531, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.025757, "value_mse_loss_layer_021": 0.027588, "value_mse_loss_layer_022": 0.02771, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.042725, "value_mse_loss_layer_026": 0.03833, "value_mse_loss_layer_027": 0.050293, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.064941, "value_mse_loss_layer_030": 0.070801, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000129, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000202, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000202, "vq_loss_layer_011": 0.000233, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000372, "vq_loss_layer_017": 0.000341, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.003281 }, { "ce_loss": 2.312701, "epoch": 0.00885, "grad_norm": 0.0022842646576464176, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.051691, "kv_vq_loss": 0.000455, "learning_rate": 0.000986735817674456, "loss": 0.052179, "step": 8850, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001671, "value_mse_loss_layer_002": 0.006256, "value_mse_loss_layer_003": 0.010071, "value_mse_loss_layer_004": 0.008728, "value_mse_loss_layer_005": 0.00885, "value_mse_loss_layer_006": 0.010498, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013794, "value_mse_loss_layer_009": 0.017944, "value_mse_loss_layer_010": 0.014954, "value_mse_loss_layer_011": 0.015991, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.017944, "value_mse_loss_layer_014": 0.019043, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.018921, "value_mse_loss_layer_019": 0.024414, "value_mse_loss_layer_020": 0.02356, "value_mse_loss_layer_021": 0.026001, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.030762, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.066895, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.060303, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.00036, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.000238, "vq_loss_layer_026": 0.00037, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.001549, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.28771, "epoch": 0.00886, "grad_norm": 0.002419382566586137, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.051898, "kv_vq_loss": 0.000462, "learning_rate": 0.0009868584304717626, "loss": 0.052396, "step": 8860, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001656, "value_mse_loss_layer_002": 0.005798, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.009033, "value_mse_loss_layer_006": 0.01062, "value_mse_loss_layer_007": 0.011597, "value_mse_loss_layer_008": 0.013611, "value_mse_loss_layer_009": 0.018188, "value_mse_loss_layer_010": 0.015198, "value_mse_loss_layer_011": 0.015869, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.022461, "value_mse_loss_layer_016": 0.0177, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.02002, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.027466, "value_mse_loss_layer_022": 0.02771, "value_mse_loss_layer_023": 0.030396, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.041748, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.061768, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000197, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.00021, "vq_loss_layer_011": 0.000228, "vq_loss_layer_012": 0.000381, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000401, "vq_loss_layer_015": 0.00045, "vq_loss_layer_016": 0.000391, "vq_loss_layer_017": 0.00038, "vq_loss_layer_018": 0.000223, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.000218, "vq_loss_layer_021": 0.000404, "vq_loss_layer_022": 0.000298, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000309, "vq_loss_layer_025": 0.000397, "vq_loss_layer_026": 0.000607, "vq_loss_layer_027": 0.000664, "vq_loss_layer_028": 0.00095, "vq_loss_layer_029": 0.001297, "vq_loss_layer_030": 0.00351, "vq_loss_layer_031": 0.004272 }, { "ce_loss": 2.325148, "epoch": 0.00887, "grad_norm": 0.0021444817539304495, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.051654, "kv_vq_loss": 0.000471, "learning_rate": 0.0009869809049579317, "loss": 0.052145, "step": 8870, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001656, "value_mse_loss_layer_002": 0.005829, "value_mse_loss_layer_003": 0.009949, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.010437, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013672, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014648, "value_mse_loss_layer_011": 0.015564, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.018799, "value_mse_loss_layer_019": 0.021606, "value_mse_loss_layer_020": 0.023071, "value_mse_loss_layer_021": 0.032715, "value_mse_loss_layer_022": 0.027222, "value_mse_loss_layer_023": 0.031006, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.042725, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.060791, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000402, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.297847, "epoch": 0.00888, "grad_norm": 0.001991954632103443, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.051575, "kv_vq_loss": 0.000459, "learning_rate": 0.00098710324144465, "loss": 0.052069, "step": 8880, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001648, "value_mse_loss_layer_002": 0.006012, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.008667, "value_mse_loss_layer_006": 0.010742, "value_mse_loss_layer_007": 0.011353, "value_mse_loss_layer_008": 0.013977, "value_mse_loss_layer_009": 0.018433, "value_mse_loss_layer_010": 0.015991, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.018921, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.022217, "value_mse_loss_layer_016": 0.018066, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.018433, "value_mse_loss_layer_019": 0.023438, "value_mse_loss_layer_020": 0.024658, "value_mse_loss_layer_021": 0.027832, "value_mse_loss_layer_022": 0.029053, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.035645, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.046631, "value_mse_loss_layer_028": 0.051514, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.061035, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000129, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000239, "vq_loss_layer_010": 0.000226, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.000383, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000402, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.000383, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.000391, "vq_loss_layer_022": 0.000303, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000336, "vq_loss_layer_025": 0.000347, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000946, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.003479 }, { "ce_loss": 2.338594, "epoch": 0.00889, "grad_norm": 0.0018517025746405125, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.051559, "kv_vq_loss": 0.000451, "learning_rate": 0.0009872254402425531, "loss": 0.052051, "step": 8890, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001648, "value_mse_loss_layer_002": 0.005859, "value_mse_loss_layer_003": 0.010132, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.009338, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013733, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014526, "value_mse_loss_layer_011": 0.016235, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.018066, "value_mse_loss_layer_019": 0.021362, "value_mse_loss_layer_020": 0.022949, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.039551, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.060059, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.061035, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000209, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.000395, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000275, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000813, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.003525 }, { "ce_loss": 2.310133, "epoch": 0.0089, "grad_norm": 0.0020932494662702084, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.051752, "kv_vq_loss": 0.000447, "learning_rate": 0.0009873475016612281, "loss": 0.05224, "step": 8900, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001656, "value_mse_loss_layer_002": 0.005707, "value_mse_loss_layer_003": 0.00946, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.008667, "value_mse_loss_layer_006": 0.010498, "value_mse_loss_layer_007": 0.011169, "value_mse_loss_layer_008": 0.013855, "value_mse_loss_layer_009": 0.018188, "value_mse_loss_layer_010": 0.014648, "value_mse_loss_layer_011": 0.015564, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.017944, "value_mse_loss_layer_014": 0.019531, "value_mse_loss_layer_015": 0.021606, "value_mse_loss_layer_016": 0.018921, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.019409, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.02356, "value_mse_loss_layer_021": 0.027832, "value_mse_loss_layer_022": 0.028931, "value_mse_loss_layer_023": 0.031128, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.041992, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.053955, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000404, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000168, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000215, "vq_loss_layer_024": 0.000196, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.000679, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.292526, "epoch": 0.00891, "grad_norm": 0.002314075594767928, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.051999, "kv_vq_loss": 0.000515, "learning_rate": 0.0009874694260092187, "loss": 0.052512, "step": 8910, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001648, "value_mse_loss_layer_002": 0.006042, "value_mse_loss_layer_003": 0.009583, "value_mse_loss_layer_004": 0.00946, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.010559, "value_mse_loss_layer_007": 0.010864, "value_mse_loss_layer_008": 0.01355, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.015137, "value_mse_loss_layer_011": 0.01532, "value_mse_loss_layer_012": 0.016235, "value_mse_loss_layer_013": 0.017944, "value_mse_loss_layer_014": 0.018555, "value_mse_loss_layer_015": 0.02124, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.018066, "value_mse_loss_layer_019": 0.021362, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.027466, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.060791, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 8.7e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000387, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000243, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.002975, "vq_loss_layer_031": 0.003387 }, { "ce_loss": 2.284882, "epoch": 0.00892, "grad_norm": 0.0019078186014667153, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.051791, "kv_vq_loss": 0.000447, "learning_rate": 0.0009875912135940306, "loss": 0.052283, "step": 8920, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001648, "value_mse_loss_layer_002": 0.005768, "value_mse_loss_layer_003": 0.009705, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.011292, "value_mse_loss_layer_008": 0.013794, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014648, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.018066, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.018188, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.023071, "value_mse_loss_layer_021": 0.026978, "value_mse_loss_layer_022": 0.0271, "value_mse_loss_layer_023": 0.030518, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.062012, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.058838, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000179, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000458, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000219, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000214, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.000465, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.00079, "vq_loss_layer_029": 0.001274, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.003784 }, { "ce_loss": 2.308871, "epoch": 0.00893, "grad_norm": 0.002300240332260728, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.051535, "kv_vq_loss": 0.000448, "learning_rate": 0.0009877128647221365, "loss": 0.052026, "step": 8930, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.00164, "value_mse_loss_layer_002": 0.005798, "value_mse_loss_layer_003": 0.009827, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.008972, "value_mse_loss_layer_006": 0.011108, "value_mse_loss_layer_007": 0.012024, "value_mse_loss_layer_008": 0.014099, "value_mse_loss_layer_009": 0.019043, "value_mse_loss_layer_010": 0.015564, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.017212, "value_mse_loss_layer_013": 0.019653, "value_mse_loss_layer_014": 0.019775, "value_mse_loss_layer_015": 0.022949, "value_mse_loss_layer_016": 0.019043, "value_mse_loss_layer_017": 0.021973, "value_mse_loss_layer_018": 0.021118, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.025391, "value_mse_loss_layer_021": 0.026733, "value_mse_loss_layer_022": 0.028809, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.041016, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000137, "vq_loss_layer_007": 0.000188, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000364, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000389, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000238, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000408, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.001602, "vq_loss_layer_031": 0.003616 }, { "ce_loss": 2.286346, "epoch": 0.00894, "grad_norm": 0.0018886678153648973, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.051575, "kv_vq_loss": 0.000451, "learning_rate": 0.0009878343796989792, "loss": 0.052069, "step": 8940, "value_mse_loss_layer_000": 0.000565, "value_mse_loss_layer_001": 0.001663, "value_mse_loss_layer_002": 0.005676, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.010254, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013733, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.014893, "value_mse_loss_layer_011": 0.015442, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.0177, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.022949, "value_mse_loss_layer_021": 0.031738, "value_mse_loss_layer_022": 0.026611, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.061035, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000427, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000225, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.221942, "epoch": 0.00895, "grad_norm": 0.002569785574451089, "key_mse_loss_layer_000": 0.003647, "key_mse_loss_layer_001": 0.011841, "key_mse_loss_layer_002": 0.063965, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.071777, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.091797, "key_mse_loss_layer_009": 0.098145, "key_mse_loss_layer_010": 0.109863, "key_mse_loss_layer_011": 0.105469, "key_mse_loss_layer_012": 0.080078, "key_mse_loss_layer_013": 0.131836, "key_mse_loss_layer_014": 0.129883, "key_mse_loss_layer_015": 0.117188, "key_mse_loss_layer_016": 0.112305, "key_mse_loss_layer_017": 0.107422, "key_mse_loss_layer_018": 0.118164, "key_mse_loss_layer_019": 0.096191, "key_mse_loss_layer_020": 0.108398, "key_mse_loss_layer_021": 0.103027, "key_mse_loss_layer_022": 0.110352, "key_mse_loss_layer_023": 0.108887, "key_mse_loss_layer_024": 0.087891, "key_mse_loss_layer_025": 0.084473, "key_mse_loss_layer_026": 0.100098, "key_mse_loss_layer_027": 0.10498, "key_mse_loss_layer_028": 0.10791, "key_mse_loss_layer_029": 0.101562, "key_mse_loss_layer_030": 0.11084, "key_mse_loss_layer_031": 0.082031, "kv_mse_loss": 0.051782, "kv_vq_loss": 0.000452, "learning_rate": 0.000987955758828978, "loss": 0.052277, "step": 8950, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001694, "value_mse_loss_layer_002": 0.006104, "value_mse_loss_layer_003": 0.01062, "value_mse_loss_layer_004": 0.009766, "value_mse_loss_layer_005": 0.009216, "value_mse_loss_layer_006": 0.010986, "value_mse_loss_layer_007": 0.011414, "value_mse_loss_layer_008": 0.013611, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.014771, "value_mse_loss_layer_011": 0.016479, "value_mse_loss_layer_012": 0.016235, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.019287, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.049805, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.075684, "value_mse_loss_layer_030": 0.071777, "value_mse_loss_layer_031": 0.067871, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.8e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 7.8e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000166, "vq_loss_layer_007": 0.000172, "vq_loss_layer_008": 0.000228, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000252, "vq_loss_layer_011": 0.00029, "vq_loss_layer_012": 0.000383, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000454, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000408, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000261, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.00022, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000298, "vq_loss_layer_024": 0.000357, "vq_loss_layer_025": 0.000538, "vq_loss_layer_026": 0.000648, "vq_loss_layer_027": 0.000984, "vq_loss_layer_028": 0.00119, "vq_loss_layer_029": 0.002502, "vq_loss_layer_030": 0.00351, "vq_loss_layer_031": 0.00589 }, { "ce_loss": 2.342913, "epoch": 0.00896, "grad_norm": 0.0020760681945830584, "key_mse_loss_layer_000": 0.00351, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.05957, "kv_mse_loss": 0.051868, "kv_vq_loss": 0.000485, "learning_rate": 0.0009880770024155313, "loss": 0.052365, "step": 8960, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.001671, "value_mse_loss_layer_002": 0.006226, "value_mse_loss_layer_003": 0.010071, "value_mse_loss_layer_004": 0.009827, "value_mse_loss_layer_005": 0.009033, "value_mse_loss_layer_006": 0.010742, "value_mse_loss_layer_007": 0.011902, "value_mse_loss_layer_008": 0.013611, "value_mse_loss_layer_009": 0.019775, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.015991, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.018921, "value_mse_loss_layer_014": 0.018921, "value_mse_loss_layer_015": 0.021362, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.025269, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.05835, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000137, "vq_loss_layer_007": 0.000208, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000328, "vq_loss_layer_010": 0.000265, "vq_loss_layer_011": 0.000248, "vq_loss_layer_012": 0.000406, "vq_loss_layer_013": 0.000364, "vq_loss_layer_014": 0.000433, "vq_loss_layer_015": 0.00053, "vq_loss_layer_016": 0.000463, "vq_loss_layer_017": 0.000427, "vq_loss_layer_018": 0.000254, "vq_loss_layer_019": 0.000221, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.000425, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.000345, "vq_loss_layer_024": 0.000357, "vq_loss_layer_025": 0.000507, "vq_loss_layer_026": 0.000633, "vq_loss_layer_027": 0.00071, "vq_loss_layer_028": 0.001091, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.00293, "vq_loss_layer_031": 0.00531 }, { "ce_loss": 2.339519, "epoch": 0.00897, "grad_norm": 0.0015483939787372947, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.05957, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.051242, "kv_vq_loss": 0.000436, "learning_rate": 0.000988198110761023, "loss": 0.05173, "step": 8970, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001648, "value_mse_loss_layer_002": 0.005798, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.010437, "value_mse_loss_layer_007": 0.011108, "value_mse_loss_layer_008": 0.013611, "value_mse_loss_layer_009": 0.018311, "value_mse_loss_layer_010": 0.015015, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.018799, "value_mse_loss_layer_015": 0.021606, "value_mse_loss_layer_016": 0.017456, "value_mse_loss_layer_017": 0.022095, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.023926, "value_mse_loss_layer_021": 0.028931, "value_mse_loss_layer_022": 0.027344, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.061768, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.060791, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000364, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000182, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000366, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000238, "vq_loss_layer_025": 0.000294, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.000935, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.273145, "epoch": 0.00898, "grad_norm": 0.0019262570422142744, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.051596, "kv_vq_loss": 0.000463, "learning_rate": 0.000988319084166826, "loss": 0.05209, "step": 8980, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001633, "value_mse_loss_layer_002": 0.00589, "value_mse_loss_layer_003": 0.009949, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.008484, "value_mse_loss_layer_006": 0.010681, "value_mse_loss_layer_007": 0.011108, "value_mse_loss_layer_008": 0.013916, "value_mse_loss_layer_009": 0.018188, "value_mse_loss_layer_010": 0.015259, "value_mse_loss_layer_011": 0.016235, "value_mse_loss_layer_012": 0.016235, "value_mse_loss_layer_013": 0.018311, "value_mse_loss_layer_014": 0.018677, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.018066, "value_mse_loss_layer_019": 0.021362, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.026001, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.060547, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000194, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000368, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.000332, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000364, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.00034, "vq_loss_layer_024": 0.000277, "vq_loss_layer_025": 0.000357, "vq_loss_layer_026": 0.000568, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000896, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.003464 }, { "ce_loss": 2.325692, "epoch": 0.00899, "grad_norm": 0.0016811219975352287, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.05166, "kv_vq_loss": 0.000461, "learning_rate": 0.0009884399229333073, "loss": 0.052155, "step": 8990, "value_mse_loss_layer_000": 0.00058, "value_mse_loss_layer_001": 0.001686, "value_mse_loss_layer_002": 0.006073, "value_mse_loss_layer_003": 0.010132, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008911, "value_mse_loss_layer_006": 0.010559, "value_mse_loss_layer_007": 0.011414, "value_mse_loss_layer_008": 0.013916, "value_mse_loss_layer_009": 0.018921, "value_mse_loss_layer_010": 0.015076, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019043, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.018433, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.022827, "value_mse_loss_layer_020": 0.023315, "value_mse_loss_layer_021": 0.027466, "value_mse_loss_layer_022": 0.027466, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.061035, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.061768, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000177, "vq_loss_layer_008": 0.000182, "vq_loss_layer_009": 0.000269, "vq_loss_layer_010": 0.000209, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.00021, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000362, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003708 }, { "ce_loss": 2.271117, "epoch": 0.009, "grad_norm": 0.0018511920934543014, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.085938, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.052084, "kv_vq_loss": 0.000456, "learning_rate": 0.000988560627359831, "loss": 0.052582, "step": 9000, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001633, "value_mse_loss_layer_002": 0.005615, "value_mse_loss_layer_003": 0.009277, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.00824, "value_mse_loss_layer_006": 0.010254, "value_mse_loss_layer_007": 0.010925, "value_mse_loss_layer_008": 0.013855, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.015259, "value_mse_loss_layer_011": 0.015564, "value_mse_loss_layer_012": 0.016602, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.018555, "value_mse_loss_layer_015": 0.021606, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.022705, "value_mse_loss_layer_021": 0.0271, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.029541, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.061523, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000364, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000441, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000343, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.001305, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.004089 }, { "ce_loss": 2.279338, "epoch": 0.00901, "grad_norm": 0.001992926700040698, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.059814, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.120117, "key_mse_loss_layer_015": 0.110352, "key_mse_loss_layer_016": 0.103516, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.095215, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.05155, "kv_vq_loss": 0.00046, "learning_rate": 0.0009886811977447658, "loss": 0.052048, "step": 9010, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001625, "value_mse_loss_layer_002": 0.005707, "value_mse_loss_layer_003": 0.009949, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.00885, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.010925, "value_mse_loss_layer_008": 0.013184, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.014343, "value_mse_loss_layer_011": 0.015259, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.026611, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.061523, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000443, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000141, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.000935, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.003052 }, { "ce_loss": 2.266746, "epoch": 0.00902, "grad_norm": 0.002431209199130535, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.051501, "kv_vq_loss": 0.000458, "learning_rate": 0.0009888016343854854, "loss": 0.051993, "step": 9020, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.00164, "value_mse_loss_layer_002": 0.005859, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.008423, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.01123, "value_mse_loss_layer_008": 0.013611, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.014587, "value_mse_loss_layer_011": 0.015625, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.0177, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.018921, "value_mse_loss_layer_019": 0.021362, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.025879, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.030151, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000179, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.000439, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.001869, "vq_loss_layer_031": 0.003235 }, { "ce_loss": 2.197328, "epoch": 0.00903, "grad_norm": 0.0019588929135352373, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.123535, "key_mse_loss_layer_015": 0.110352, "key_mse_loss_layer_016": 0.105957, "key_mse_loss_layer_017": 0.105469, "key_mse_loss_layer_018": 0.112793, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.10498, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.104004, "key_mse_loss_layer_023": 0.101074, "key_mse_loss_layer_024": 0.081543, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.090332, "key_mse_loss_layer_027": 0.088867, "key_mse_loss_layer_028": 0.096191, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.052014, "kv_vq_loss": 0.000476, "learning_rate": 0.0009889219375783763, "loss": 0.052512, "step": 9030, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001663, "value_mse_loss_layer_002": 0.006561, "value_mse_loss_layer_003": 0.009949, "value_mse_loss_layer_004": 0.009399, "value_mse_loss_layer_005": 0.00885, "value_mse_loss_layer_006": 0.010376, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013611, "value_mse_loss_layer_009": 0.017944, "value_mse_loss_layer_010": 0.015198, "value_mse_loss_layer_011": 0.015198, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.020386, "value_mse_loss_layer_018": 0.018799, "value_mse_loss_layer_019": 0.021851, "value_mse_loss_layer_020": 0.023193, "value_mse_loss_layer_021": 0.030151, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.0625, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000203, "vq_loss_layer_009": 0.000243, "vq_loss_layer_010": 0.000206, "vq_loss_layer_011": 0.000223, "vq_loss_layer_012": 0.00036, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.00037, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000479, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.00106, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.004395 }, { "ce_loss": 2.28264, "epoch": 0.00904, "grad_norm": 0.0018097900319844484, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.0517, "kv_vq_loss": 0.000454, "learning_rate": 0.0009890421076188408, "loss": 0.052197, "step": 9040, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001633, "value_mse_loss_layer_002": 0.005859, "value_mse_loss_layer_003": 0.009888, "value_mse_loss_layer_004": 0.009094, "value_mse_loss_layer_005": 0.008423, "value_mse_loss_layer_006": 0.010681, "value_mse_loss_layer_007": 0.011414, "value_mse_loss_layer_008": 0.013733, "value_mse_loss_layer_009": 0.017944, "value_mse_loss_layer_010": 0.014465, "value_mse_loss_layer_011": 0.015442, "value_mse_loss_layer_012": 0.016602, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.018555, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.026733, "value_mse_loss_layer_022": 0.027222, "value_mse_loss_layer_023": 0.030151, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.060547, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.060303, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000196, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000362, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.00024, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000809, "vq_loss_layer_030": 0.001938, "vq_loss_layer_031": 0.003494 }, { "ce_loss": 2.265501, "epoch": 0.00905, "grad_norm": 0.0018538640579208732, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.089355, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.052164, "kv_vq_loss": 0.000472, "learning_rate": 0.0009891621448013005, "loss": 0.052664, "step": 9050, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001633, "value_mse_loss_layer_002": 0.00589, "value_mse_loss_layer_003": 0.00946, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.010254, "value_mse_loss_layer_006": 0.010254, "value_mse_loss_layer_007": 0.011108, "value_mse_loss_layer_008": 0.013916, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.014832, "value_mse_loss_layer_011": 0.015625, "value_mse_loss_layer_012": 0.01709, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018921, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.018433, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.019043, "value_mse_loss_layer_019": 0.023315, "value_mse_loss_layer_020": 0.023682, "value_mse_loss_layer_021": 0.027466, "value_mse_loss_layer_022": 0.028687, "value_mse_loss_layer_023": 0.032227, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.042236, "value_mse_loss_layer_026": 0.037598, "value_mse_loss_layer_027": 0.048096, "value_mse_loss_layer_028": 0.055176, "value_mse_loss_layer_029": 0.072266, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 0.000101, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.00018, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000397, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000389, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.000203, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.00025, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.003357 }, { "ce_loss": 2.30693, "epoch": 0.00906, "grad_norm": 0.0020491217728704214, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.080566, "kv_mse_loss": 0.0517, "kv_vq_loss": 0.000461, "learning_rate": 0.000989282049419203, "loss": 0.052194, "step": 9060, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001648, "value_mse_loss_layer_002": 0.006104, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.010864, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.01355, "value_mse_loss_layer_009": 0.018188, "value_mse_loss_layer_010": 0.014587, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.017944, "value_mse_loss_layer_014": 0.018555, "value_mse_loss_layer_015": 0.02124, "value_mse_loss_layer_016": 0.018921, "value_mse_loss_layer_017": 0.021118, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.022095, "value_mse_loss_layer_020": 0.023315, "value_mse_loss_layer_021": 0.026123, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.030762, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.059814, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.00014, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000359, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000414, "vq_loss_layer_017": 0.000359, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000194, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000908, "vq_loss_layer_029": 0.001411, "vq_loss_layer_030": 0.003311, "vq_loss_layer_031": 0.005127 }, { "ce_loss": 2.283389, "epoch": 0.00907, "grad_norm": 0.0016413115663453937, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.051874, "kv_vq_loss": 0.000476, "learning_rate": 0.0009894018217650236, "loss": 0.052371, "step": 9070, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001633, "value_mse_loss_layer_002": 0.00589, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.010437, "value_mse_loss_layer_007": 0.011353, "value_mse_loss_layer_008": 0.013855, "value_mse_loss_layer_009": 0.017944, "value_mse_loss_layer_010": 0.014709, "value_mse_loss_layer_011": 0.015564, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.021606, "value_mse_loss_layer_016": 0.017578, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.021484, "value_mse_loss_layer_020": 0.023926, "value_mse_loss_layer_021": 0.026245, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.058838, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000188, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000427, "vq_loss_layer_015": 0.000446, "vq_loss_layer_016": 0.000381, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000477, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.288807, "epoch": 0.00908, "grad_norm": 0.002344372682273388, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.059814, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.093262, "key_mse_loss_layer_009": 0.098633, "key_mse_loss_layer_010": 0.114258, "key_mse_loss_layer_011": 0.108398, "key_mse_loss_layer_012": 0.081543, "key_mse_loss_layer_013": 0.137695, "key_mse_loss_layer_014": 0.133789, "key_mse_loss_layer_015": 0.121582, "key_mse_loss_layer_016": 0.108887, "key_mse_loss_layer_017": 0.109863, "key_mse_loss_layer_018": 0.112793, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.107422, "key_mse_loss_layer_021": 0.100098, "key_mse_loss_layer_022": 0.102051, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.060059, "kv_mse_loss": 0.051492, "kv_vq_loss": 0.000446, "learning_rate": 0.0009895214621302713, "loss": 0.051981, "step": 9080, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001625, "value_mse_loss_layer_002": 0.006104, "value_mse_loss_layer_003": 0.01001, "value_mse_loss_layer_004": 0.009094, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.010498, "value_mse_loss_layer_007": 0.011597, "value_mse_loss_layer_008": 0.01355, "value_mse_loss_layer_009": 0.019287, "value_mse_loss_layer_010": 0.015869, "value_mse_loss_layer_011": 0.016479, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.019165, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.028076, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.058105, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000141, "vq_loss_layer_007": 0.000193, "vq_loss_layer_008": 0.000239, "vq_loss_layer_009": 0.000303, "vq_loss_layer_010": 0.000284, "vq_loss_layer_011": 0.000275, "vq_loss_layer_012": 0.000408, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.000467, "vq_loss_layer_015": 0.000454, "vq_loss_layer_016": 0.000368, "vq_loss_layer_017": 0.000359, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000204, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.000456, "vq_loss_layer_022": 0.000406, "vq_loss_layer_023": 0.000452, "vq_loss_layer_024": 0.00042, "vq_loss_layer_025": 0.00071, "vq_loss_layer_026": 0.000626, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.001152, "vq_loss_layer_029": 0.001114, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.0047 }, { "ce_loss": 2.300443, "epoch": 0.00909, "grad_norm": 0.001888573169708252, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.060547, "kv_mse_loss": 0.051224, "kv_vq_loss": 0.000444, "learning_rate": 0.0009896409708054917, "loss": 0.051715, "step": 9090, "value_mse_loss_layer_000": 0.000534, "value_mse_loss_layer_001": 0.001617, "value_mse_loss_layer_002": 0.005859, "value_mse_loss_layer_003": 0.009705, "value_mse_loss_layer_004": 0.009338, "value_mse_loss_layer_005": 0.009521, "value_mse_loss_layer_006": 0.010803, "value_mse_loss_layer_007": 0.011841, "value_mse_loss_layer_008": 0.013794, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.015259, "value_mse_loss_layer_011": 0.016235, "value_mse_loss_layer_012": 0.0177, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.021851, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.060303, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.060059, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 9.2e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000195, "vq_loss_layer_008": 0.000195, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.000223, "vq_loss_layer_011": 0.000252, "vq_loss_layer_012": 0.000402, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000458, "vq_loss_layer_015": 0.00046, "vq_loss_layer_016": 0.000381, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000313, "vq_loss_layer_024": 0.000326, "vq_loss_layer_025": 0.000357, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000854, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.00415 }, { "ce_loss": 2.284406, "epoch": 0.0091, "grad_norm": 0.002092099515721202, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.051553, "kv_vq_loss": 0.00046, "learning_rate": 0.0009897603480802733, "loss": 0.052045, "step": 9100, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001656, "value_mse_loss_layer_002": 0.005737, "value_mse_loss_layer_003": 0.009644, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.010376, "value_mse_loss_layer_007": 0.010925, "value_mse_loss_layer_008": 0.014404, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.015015, "value_mse_loss_layer_011": 0.015625, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.02124, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.022827, "value_mse_loss_layer_020": 0.023926, "value_mse_loss_layer_021": 0.028442, "value_mse_loss_layer_022": 0.02771, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.046631, "value_mse_loss_layer_028": 0.052246, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.061279, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.00012, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.00021, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.00038, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000359, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000751, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.002991 }, { "ce_loss": 2.330839, "epoch": 0.00911, "grad_norm": 0.0021952688694000244, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.051514, "kv_vq_loss": 0.000436, "learning_rate": 0.0009898795942432494, "loss": 0.052002, "step": 9110, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001648, "value_mse_loss_layer_002": 0.005859, "value_mse_loss_layer_003": 0.00946, "value_mse_loss_layer_004": 0.008667, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.010437, "value_mse_loss_layer_007": 0.010925, "value_mse_loss_layer_008": 0.013611, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014771, "value_mse_loss_layer_011": 0.015991, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.018555, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.016968, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.023071, "value_mse_loss_layer_021": 0.025879, "value_mse_loss_layer_022": 0.027588, "value_mse_loss_layer_023": 0.033447, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.00024, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000271, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000938, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.003342 }, { "ce_loss": 2.288105, "epoch": 0.00912, "grad_norm": 0.002076275646686554, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.051389, "kv_vq_loss": 0.000458, "learning_rate": 0.000989998709582104, "loss": 0.05188, "step": 9120, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.00161, "value_mse_loss_layer_002": 0.005737, "value_mse_loss_layer_003": 0.009277, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.00885, "value_mse_loss_layer_006": 0.010376, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013245, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014893, "value_mse_loss_layer_011": 0.015198, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.0177, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.016602, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.026855, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.060303, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000395, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000389, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000347, "vq_loss_layer_026": 0.000534, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.001892, "vq_loss_layer_031": 0.00354 }, { "ce_loss": 2.332615, "epoch": 0.00913, "grad_norm": 0.0024016641546040773, "key_mse_loss_layer_000": 0.002792, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.064941, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.051123, "kv_vq_loss": 0.00044, "learning_rate": 0.0009901176943835745, "loss": 0.051611, "step": 9130, "value_mse_loss_layer_000": 0.000526, "value_mse_loss_layer_001": 0.001602, "value_mse_loss_layer_002": 0.006287, "value_mse_loss_layer_003": 0.009277, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.010559, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013794, "value_mse_loss_layer_009": 0.018799, "value_mse_loss_layer_010": 0.015381, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.019897, "value_mse_loss_layer_015": 0.021851, "value_mse_loss_layer_016": 0.018555, "value_mse_loss_layer_017": 0.021851, "value_mse_loss_layer_018": 0.018921, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.023315, "value_mse_loss_layer_021": 0.026611, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.039551, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.064941, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000172, "vq_loss_layer_008": 0.000191, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000238, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000328, "vq_loss_layer_014": 0.000423, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000372, "vq_loss_layer_017": 0.000347, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000412, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.002991 }, { "ce_loss": 2.316122, "epoch": 0.00914, "grad_norm": 0.002617651829496026, "key_mse_loss_layer_000": 0.00354, "key_mse_loss_layer_001": 0.01123, "key_mse_loss_layer_002": 0.062012, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.082031, "key_mse_loss_layer_025": 0.078125, "key_mse_loss_layer_026": 0.090332, "key_mse_loss_layer_027": 0.097656, "key_mse_loss_layer_028": 0.099609, "key_mse_loss_layer_029": 0.099121, "key_mse_loss_layer_030": 0.097168, "key_mse_loss_layer_031": 0.081055, "kv_mse_loss": 0.051419, "kv_vq_loss": 0.00044, "learning_rate": 0.0009902365489334577, "loss": 0.05191, "step": 9140, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001633, "value_mse_loss_layer_002": 0.006012, "value_mse_loss_layer_003": 0.010376, "value_mse_loss_layer_004": 0.00946, "value_mse_loss_layer_005": 0.008667, "value_mse_loss_layer_006": 0.010254, "value_mse_loss_layer_007": 0.010986, "value_mse_loss_layer_008": 0.01355, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.014343, "value_mse_loss_layer_011": 0.015198, "value_mse_loss_layer_012": 0.016602, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.020264, "value_mse_loss_layer_019": 0.022095, "value_mse_loss_layer_020": 0.023315, "value_mse_loss_layer_021": 0.029541, "value_mse_loss_layer_022": 0.027954, "value_mse_loss_layer_023": 0.032715, "value_mse_loss_layer_024": 0.037598, "value_mse_loss_layer_025": 0.043213, "value_mse_loss_layer_026": 0.042725, "value_mse_loss_layer_027": 0.054932, "value_mse_loss_layer_028": 0.060547, "value_mse_loss_layer_029": 0.082031, "value_mse_loss_layer_030": 0.081055, "value_mse_loss_layer_031": 0.077148, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000227, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000211, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000397, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000328, "vq_loss_layer_019": 0.00019, "vq_loss_layer_020": 0.000164, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000223, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000383, "vq_loss_layer_026": 0.00061, "vq_loss_layer_027": 0.000694, "vq_loss_layer_028": 0.001289, "vq_loss_layer_029": 0.002106, "vq_loss_layer_030": 0.003006, "vq_loss_layer_031": 0.005951 }, { "ce_loss": 2.312935, "epoch": 0.00915, "grad_norm": 0.002093774499371648, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.051434, "kv_vq_loss": 0.000467, "learning_rate": 0.000990355273516612, "loss": 0.051932, "step": 9150, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001701, "value_mse_loss_layer_002": 0.005737, "value_mse_loss_layer_003": 0.009277, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.010803, "value_mse_loss_layer_008": 0.013733, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.014832, "value_mse_loss_layer_011": 0.015442, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.017944, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.026733, "value_mse_loss_layer_022": 0.027466, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.049072, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.063477, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000374, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.003632 }, { "ce_loss": 2.312938, "epoch": 0.00916, "grad_norm": 0.0019431761465966702, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.051553, "kv_vq_loss": 0.000453, "learning_rate": 0.0009904738684169624, "loss": 0.052045, "step": 9160, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001656, "value_mse_loss_layer_002": 0.006836, "value_mse_loss_layer_003": 0.010742, "value_mse_loss_layer_004": 0.009277, "value_mse_loss_layer_005": 0.00946, "value_mse_loss_layer_006": 0.010864, "value_mse_loss_layer_007": 0.011658, "value_mse_loss_layer_008": 0.013428, "value_mse_loss_layer_009": 0.018677, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.016602, "value_mse_loss_layer_013": 0.018311, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.017456, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.026245, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.058838, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.060791, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 9.7e-05, "vq_loss_layer_006": 0.000145, "vq_loss_layer_007": 0.0002, "vq_loss_layer_008": 0.000187, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000246, "vq_loss_layer_011": 0.00025, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.000343, "vq_loss_layer_014": 0.000437, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.000458, "vq_loss_layer_017": 0.000408, "vq_loss_layer_018": 0.000222, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000244, "vq_loss_layer_021": 0.000458, "vq_loss_layer_022": 0.000305, "vq_loss_layer_023": 0.000307, "vq_loss_layer_024": 0.00034, "vq_loss_layer_025": 0.000463, "vq_loss_layer_026": 0.000645, "vq_loss_layer_027": 0.00079, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.001122, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.004272 }, { "ce_loss": 2.282397, "epoch": 0.00917, "grad_norm": 0.002280261367559433, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.051364, "kv_vq_loss": 0.000461, "learning_rate": 0.0009905923339175052, "loss": 0.051855, "step": 9170, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001648, "value_mse_loss_layer_002": 0.005798, "value_mse_loss_layer_003": 0.009766, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.008484, "value_mse_loss_layer_006": 0.010376, "value_mse_loss_layer_007": 0.010925, "value_mse_loss_layer_008": 0.013977, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014587, "value_mse_loss_layer_011": 0.015503, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.018311, "value_mse_loss_layer_014": 0.018555, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.019287, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.024292, "value_mse_loss_layer_021": 0.026123, "value_mse_loss_layer_022": 0.027344, "value_mse_loss_layer_023": 0.030396, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.061768, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000401, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.000767, "vq_loss_layer_030": 0.003006, "vq_loss_layer_031": 0.003708 }, { "ce_loss": 2.316006, "epoch": 0.00918, "grad_norm": 0.0018638510955497622, "key_mse_loss_layer_000": 0.00267, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.092285, "key_mse_loss_layer_009": 0.101562, "key_mse_loss_layer_010": 0.115723, "key_mse_loss_layer_011": 0.110352, "key_mse_loss_layer_012": 0.084473, "key_mse_loss_layer_013": 0.149414, "key_mse_loss_layer_014": 0.146484, "key_mse_loss_layer_015": 0.130859, "key_mse_loss_layer_016": 0.126953, "key_mse_loss_layer_017": 0.126953, "key_mse_loss_layer_018": 0.133789, "key_mse_loss_layer_019": 0.10498, "key_mse_loss_layer_020": 0.124512, "key_mse_loss_layer_021": 0.117188, "key_mse_loss_layer_022": 0.12207, "key_mse_loss_layer_023": 0.115723, "key_mse_loss_layer_024": 0.090332, "key_mse_loss_layer_025": 0.084961, "key_mse_loss_layer_026": 0.104004, "key_mse_loss_layer_027": 0.09668, "key_mse_loss_layer_028": 0.10791, "key_mse_loss_layer_029": 0.092285, "key_mse_loss_layer_030": 0.106934, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.051904, "kv_vq_loss": 0.00047, "learning_rate": 0.0009907106703003104, "loss": 0.052402, "step": 9180, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001617, "value_mse_loss_layer_002": 0.006134, "value_mse_loss_layer_003": 0.009583, "value_mse_loss_layer_004": 0.009277, "value_mse_loss_layer_005": 0.009094, "value_mse_loss_layer_006": 0.010681, "value_mse_loss_layer_007": 0.011658, "value_mse_loss_layer_008": 0.013916, "value_mse_loss_layer_009": 0.018188, "value_mse_loss_layer_010": 0.016235, "value_mse_loss_layer_011": 0.016602, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.018433, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.028564, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.024414, "value_mse_loss_layer_024": 0.0271, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.028442, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.040283, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.056885, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 2.2e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 9.2e-05, "vq_loss_layer_006": 0.00015, "vq_loss_layer_007": 0.00021, "vq_loss_layer_008": 0.000271, "vq_loss_layer_009": 0.000267, "vq_loss_layer_010": 0.000294, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000439, "vq_loss_layer_013": 0.000351, "vq_loss_layer_014": 0.000565, "vq_loss_layer_015": 0.000444, "vq_loss_layer_016": 0.000404, "vq_loss_layer_017": 0.000345, "vq_loss_layer_018": 0.000246, "vq_loss_layer_019": 0.000248, "vq_loss_layer_020": 0.000301, "vq_loss_layer_021": 0.000702, "vq_loss_layer_022": 0.00041, "vq_loss_layer_023": 0.000366, "vq_loss_layer_024": 0.000362, "vq_loss_layer_025": 0.000694, "vq_loss_layer_026": 0.000576, "vq_loss_layer_027": 0.000633, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.294006, "epoch": 0.00919, "grad_norm": 0.0020345046650618315, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.09082, "key_mse_loss_layer_030": 0.098633, "key_mse_loss_layer_031": 0.088379, "kv_mse_loss": 0.05184, "kv_vq_loss": 0.000442, "learning_rate": 0.0009908288778465277, "loss": 0.052338, "step": 9190, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001617, "value_mse_loss_layer_002": 0.005798, "value_mse_loss_layer_003": 0.009399, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.010254, "value_mse_loss_layer_007": 0.011108, "value_mse_loss_layer_008": 0.013489, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014771, "value_mse_loss_layer_011": 0.015564, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.017456, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.019165, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.023193, "value_mse_loss_layer_021": 0.027954, "value_mse_loss_layer_022": 0.028442, "value_mse_loss_layer_023": 0.033691, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.04126, "value_mse_loss_layer_027": 0.048584, "value_mse_loss_layer_028": 0.05249, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.060303, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.00018, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000341, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000213, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000334, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000641, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000946, "vq_loss_layer_029": 0.001595, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.00412 }, { "ce_loss": 2.310532, "epoch": 0.0092, "grad_norm": 0.0018512561218813062, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.010986, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.080078, "kv_mse_loss": 0.051691, "kv_vq_loss": 0.00044, "learning_rate": 0.0009909469568363887, "loss": 0.052176, "step": 9200, "value_mse_loss_layer_000": 0.000565, "value_mse_loss_layer_001": 0.001663, "value_mse_loss_layer_002": 0.00592, "value_mse_loss_layer_003": 0.009583, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.008911, "value_mse_loss_layer_006": 0.010376, "value_mse_loss_layer_007": 0.011169, "value_mse_loss_layer_008": 0.014221, "value_mse_loss_layer_009": 0.019165, "value_mse_loss_layer_010": 0.015015, "value_mse_loss_layer_011": 0.015869, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.018433, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.021851, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.020874, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.02356, "value_mse_loss_layer_021": 0.027588, "value_mse_loss_layer_022": 0.027588, "value_mse_loss_layer_023": 0.030151, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.037598, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.051514, "value_mse_loss_layer_029": 0.064941, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.060791, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000299, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000393, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.00036, "vq_loss_layer_018": 0.000224, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.00034, "vq_loss_layer_026": 0.000561, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.001282, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.004425 }, { "ce_loss": 2.284056, "epoch": 0.00921, "grad_norm": 0.0019676354713737965, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.051822, "kv_vq_loss": 0.000455, "learning_rate": 0.000991064907549212, "loss": 0.052313, "step": 9210, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.001671, "value_mse_loss_layer_002": 0.005859, "value_mse_loss_layer_003": 0.009583, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.010559, "value_mse_loss_layer_007": 0.01123, "value_mse_loss_layer_008": 0.013367, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.015076, "value_mse_loss_layer_011": 0.015625, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.021851, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.018188, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.023071, "value_mse_loss_layer_021": 0.025879, "value_mse_loss_layer_022": 0.027466, "value_mse_loss_layer_023": 0.028564, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.05835, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.060303, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000194, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000383, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000429, "vq_loss_layer_016": 0.000387, "vq_loss_layer_017": 0.000341, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.002548, "vq_loss_layer_031": 0.003342 }, { "ce_loss": 2.326485, "epoch": 0.00922, "grad_norm": 0.0016493111616000533, "key_mse_loss_layer_000": 0.00354, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.104492, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.114258, "key_mse_loss_layer_016": 0.108887, "key_mse_loss_layer_017": 0.109375, "key_mse_loss_layer_018": 0.116699, "key_mse_loss_layer_019": 0.095703, "key_mse_loss_layer_020": 0.108887, "key_mse_loss_layer_021": 0.103516, "key_mse_loss_layer_022": 0.106445, "key_mse_loss_layer_023": 0.103027, "key_mse_loss_layer_024": 0.08252, "key_mse_loss_layer_025": 0.078613, "key_mse_loss_layer_026": 0.091797, "key_mse_loss_layer_027": 0.09082, "key_mse_loss_layer_028": 0.097168, "key_mse_loss_layer_029": 0.089355, "key_mse_loss_layer_030": 0.099609, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.051804, "kv_vq_loss": 0.000449, "learning_rate": 0.0009911827302634072, "loss": 0.052298, "step": 9220, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001633, "value_mse_loss_layer_002": 0.006042, "value_mse_loss_layer_003": 0.009766, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.008667, "value_mse_loss_layer_006": 0.01062, "value_mse_loss_layer_007": 0.01123, "value_mse_loss_layer_008": 0.013428, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.014771, "value_mse_loss_layer_011": 0.015625, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.019043, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020386, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.026733, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.031006, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.061279, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000135, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000209, "vq_loss_layer_011": 0.000229, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000443, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000303, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.000465, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.003418 }, { "ce_loss": 2.323568, "epoch": 0.00923, "grad_norm": 0.0020562170539051294, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.051437, "kv_vq_loss": 0.000436, "learning_rate": 0.000991300425256478, "loss": 0.051926, "step": 9230, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001579, "value_mse_loss_layer_002": 0.005737, "value_mse_loss_layer_003": 0.00946, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.008484, "value_mse_loss_layer_006": 0.010254, "value_mse_loss_layer_007": 0.010803, "value_mse_loss_layer_008": 0.013977, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014893, "value_mse_loss_layer_011": 0.01532, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.019043, "value_mse_loss_layer_019": 0.021606, "value_mse_loss_layer_020": 0.023438, "value_mse_loss_layer_021": 0.026367, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.030884, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.062012, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.060791, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000195, "vq_loss_layer_009": 0.000205, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000218, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000224, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.000809, "vq_loss_layer_030": 0.001595, "vq_loss_layer_031": 0.003464 }, { "ce_loss": 2.294767, "epoch": 0.00924, "grad_norm": 0.002607834292575717, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.051501, "kv_vq_loss": 0.000476, "learning_rate": 0.0009914179928050264, "loss": 0.052002, "step": 9240, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001602, "value_mse_loss_layer_002": 0.006012, "value_mse_loss_layer_003": 0.009705, "value_mse_loss_layer_004": 0.008728, "value_mse_loss_layer_005": 0.008728, "value_mse_loss_layer_006": 0.010437, "value_mse_loss_layer_007": 0.011597, "value_mse_loss_layer_008": 0.01355, "value_mse_loss_layer_009": 0.017944, "value_mse_loss_layer_010": 0.014587, "value_mse_loss_layer_011": 0.015625, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018677, "value_mse_loss_layer_015": 0.02124, "value_mse_loss_layer_016": 0.019531, "value_mse_loss_layer_017": 0.021118, "value_mse_loss_layer_018": 0.019165, "value_mse_loss_layer_019": 0.021606, "value_mse_loss_layer_020": 0.022949, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.029541, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.064941, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.00018, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.00042, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000209, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.003265 }, { "ce_loss": 2.30772, "epoch": 0.00925, "grad_norm": 0.0019118183990940452, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.09375, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.051495, "kv_vq_loss": 0.000453, "learning_rate": 0.0009915354331847582, "loss": 0.05199, "step": 9250, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.00164, "value_mse_loss_layer_002": 0.005859, "value_mse_loss_layer_003": 0.009888, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.010437, "value_mse_loss_layer_007": 0.01123, "value_mse_loss_layer_008": 0.013489, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014587, "value_mse_loss_layer_011": 0.015137, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.019043, "value_mse_loss_layer_019": 0.021362, "value_mse_loss_layer_020": 0.022705, "value_mse_loss_layer_021": 0.026855, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.060791, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.062256, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000362, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000236, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000374, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.000946, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.285967, "epoch": 0.00926, "grad_norm": 0.0018418056424707174, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.051285, "kv_vq_loss": 0.000437, "learning_rate": 0.0009916527466704835, "loss": 0.05177, "step": 9260, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001625, "value_mse_loss_layer_002": 0.005676, "value_mse_loss_layer_003": 0.009277, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.008484, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013489, "value_mse_loss_layer_009": 0.018433, "value_mse_loss_layer_010": 0.014832, "value_mse_loss_layer_011": 0.015869, "value_mse_loss_layer_012": 0.017944, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.018799, "value_mse_loss_layer_015": 0.021362, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.024536, "value_mse_loss_layer_021": 0.026245, "value_mse_loss_layer_022": 0.028076, "value_mse_loss_layer_023": 0.030518, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.061768, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.5e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000244, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000441, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000221, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000456, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.00082, "vq_loss_layer_030": 0.001755, "vq_loss_layer_031": 0.003128 }, { "ce_loss": 2.349222, "epoch": 0.00927, "grad_norm": 0.00195556553080678, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.051581, "kv_vq_loss": 0.000452, "learning_rate": 0.000991769933536124, "loss": 0.052072, "step": 9270, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.00164, "value_mse_loss_layer_002": 0.007294, "value_mse_loss_layer_003": 0.009827, "value_mse_loss_layer_004": 0.008728, "value_mse_loss_layer_005": 0.008789, "value_mse_loss_layer_006": 0.010742, "value_mse_loss_layer_007": 0.011841, "value_mse_loss_layer_008": 0.013672, "value_mse_loss_layer_009": 0.018188, "value_mse_loss_layer_010": 0.015015, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.021973, "value_mse_loss_layer_016": 0.0177, "value_mse_loss_layer_017": 0.022095, "value_mse_loss_layer_018": 0.018799, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.023926, "value_mse_loss_layer_021": 0.026855, "value_mse_loss_layer_022": 0.027466, "value_mse_loss_layer_023": 0.030396, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.060791, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.0625, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.00014, "vq_loss_layer_007": 0.000205, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000233, "vq_loss_layer_010": 0.000194, "vq_loss_layer_011": 0.000231, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000406, "vq_loss_layer_018": 0.000211, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000326, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000977, "vq_loss_layer_029": 0.001114, "vq_loss_layer_030": 0.002701, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.305916, "epoch": 0.00928, "grad_norm": 0.0023747298400849104, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.051532, "kv_vq_loss": 0.00046, "learning_rate": 0.0009918869940547155, "loss": 0.052026, "step": 9280, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.00161, "value_mse_loss_layer_002": 0.006042, "value_mse_loss_layer_003": 0.009705, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.011108, "value_mse_loss_layer_008": 0.013184, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014221, "value_mse_loss_layer_011": 0.015076, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.031006, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.059082, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.000233, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000406, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000412, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.000511, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.00034, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.000973, "vq_loss_layer_029": 0.001099, "vq_loss_layer_030": 0.00351, "vq_loss_layer_031": 0.003845 }, { "ce_loss": 2.298711, "epoch": 0.00929, "grad_norm": 0.002115085022523999, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.051691, "kv_vq_loss": 0.00046, "learning_rate": 0.0009920039284984103, "loss": 0.052179, "step": 9290, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.005737, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.010742, "value_mse_loss_layer_007": 0.010864, "value_mse_loss_layer_008": 0.01355, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014587, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.018433, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.023071, "value_mse_loss_layer_021": 0.026489, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.030884, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.047607, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.062012, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 7.5e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000143, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000277, "vq_loss_layer_025": 0.000326, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000607, "vq_loss_layer_028": 0.000854, "vq_loss_layer_029": 0.001144, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.004089 }, { "ce_loss": 2.317547, "epoch": 0.0093, "grad_norm": 0.0015779392560943961, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.051614, "kv_vq_loss": 0.000456, "learning_rate": 0.0009921207371384837, "loss": 0.052109, "step": 9300, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.00161, "value_mse_loss_layer_002": 0.005798, "value_mse_loss_layer_003": 0.010498, "value_mse_loss_layer_004": 0.009094, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.011353, "value_mse_loss_layer_008": 0.013306, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014526, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018799, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.016602, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.026123, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.062012, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.057129, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.00018, "vq_loss_layer_008": 0.000191, "vq_loss_layer_009": 0.000233, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.00036, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000433, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.000399, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000904, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.001732, "vq_loss_layer_031": 0.003647 }, { "ce_loss": 2.310687, "epoch": 0.00931, "grad_norm": 0.00190591043792665, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.051138, "kv_vq_loss": 0.000452, "learning_rate": 0.0009922374202453355, "loss": 0.051633, "step": 9310, "value_mse_loss_layer_000": 0.000534, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.005951, "value_mse_loss_layer_003": 0.009583, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.009094, "value_mse_loss_layer_006": 0.010559, "value_mse_loss_layer_007": 0.011108, "value_mse_loss_layer_008": 0.013184, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014404, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.018188, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.00053, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000359, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000372, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000456, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.000801, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.003403, "vq_loss_layer_031": 0.003677 }, { "ce_loss": 2.346527, "epoch": 0.00932, "grad_norm": 0.0017940645338967443, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.124023, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.106445, "key_mse_loss_layer_017": 0.105469, "key_mse_loss_layer_018": 0.112793, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.103027, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.051547, "kv_vq_loss": 0.000458, "learning_rate": 0.0009923539780884953, "loss": 0.052039, "step": 9320, "value_mse_loss_layer_000": 0.000591, "value_mse_loss_layer_001": 0.001625, "value_mse_loss_layer_002": 0.006317, "value_mse_loss_layer_003": 0.009583, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.011169, "value_mse_loss_layer_008": 0.013245, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.014954, "value_mse_loss_layer_011": 0.015381, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.058105, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.060547, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000195, "vq_loss_layer_009": 0.00024, "vq_loss_layer_010": 0.000215, "vq_loss_layer_011": 0.000223, "vq_loss_layer_012": 0.000383, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000393, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.000381, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000219, "vq_loss_layer_021": 0.000423, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000332, "vq_loss_layer_024": 0.000334, "vq_loss_layer_025": 0.00042, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.000862, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.300961, "epoch": 0.00933, "grad_norm": 0.0018854335648939013, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.051593, "kv_vq_loss": 0.000447, "learning_rate": 0.0009924704109366249, "loss": 0.052081, "step": 9330, "value_mse_loss_layer_000": 0.000572, "value_mse_loss_layer_001": 0.001656, "value_mse_loss_layer_002": 0.005676, "value_mse_loss_layer_003": 0.009766, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008179, "value_mse_loss_layer_006": 0.010498, "value_mse_loss_layer_007": 0.010864, "value_mse_loss_layer_008": 0.013428, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014832, "value_mse_loss_layer_011": 0.015625, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.017944, "value_mse_loss_layer_014": 0.018921, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.021118, "value_mse_loss_layer_018": 0.019165, "value_mse_loss_layer_019": 0.024902, "value_mse_loss_layer_020": 0.025146, "value_mse_loss_layer_021": 0.026489, "value_mse_loss_layer_022": 0.028198, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.061035, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.059326, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000288, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000475, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.001045, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.003876 }, { "ce_loss": 2.303317, "epoch": 0.00934, "grad_norm": 0.0019978599157184362, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.051883, "kv_vq_loss": 0.000497, "learning_rate": 0.000992586719057523, "loss": 0.05239, "step": 9340, "value_mse_loss_layer_000": 0.000538, "value_mse_loss_layer_001": 0.001579, "value_mse_loss_layer_002": 0.005707, "value_mse_loss_layer_003": 0.00946, "value_mse_loss_layer_004": 0.008667, "value_mse_loss_layer_005": 0.008484, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.011475, "value_mse_loss_layer_008": 0.013428, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.015015, "value_mse_loss_layer_011": 0.015503, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.0177, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.025269, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.057129, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000198, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000209, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.00041, "vq_loss_layer_017": 0.000345, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000237, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000288, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000355, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.003082, "vq_loss_layer_031": 0.003708 }, { "ce_loss": 2.312781, "epoch": 0.00935, "grad_norm": 0.0015627375105395913, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.051251, "kv_vq_loss": 0.000447, "learning_rate": 0.0009927029027181292, "loss": 0.051743, "step": 9350, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.00161, "value_mse_loss_layer_002": 0.005676, "value_mse_loss_layer_003": 0.009705, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013672, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014465, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.018311, "value_mse_loss_layer_014": 0.018799, "value_mse_loss_layer_015": 0.02124, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.018799, "value_mse_loss_layer_019": 0.021362, "value_mse_loss_layer_020": 0.022949, "value_mse_loss_layer_021": 0.025513, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.035645, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.051514, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.062012, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000362, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000214, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000173, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000202, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000486, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.00106, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.003723 }, { "ce_loss": 2.318494, "epoch": 0.00936, "grad_norm": 0.0019794106483459473, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.062256, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.051218, "kv_vq_loss": 0.000442, "learning_rate": 0.0009928189621845262, "loss": 0.051706, "step": 9360, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001625, "value_mse_loss_layer_002": 0.005798, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.010803, "value_mse_loss_layer_007": 0.01062, "value_mse_loss_layer_008": 0.013306, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.016968, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.016235, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.018921, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.027466, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.031006, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.058838, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.056885, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000154, "vq_loss_layer_007": 0.000179, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000259, "vq_loss_layer_010": 0.000214, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000362, "vq_loss_layer_013": 0.000362, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000345, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000199, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.001846, "vq_loss_layer_031": 0.003159 }, { "ce_loss": 2.281403, "epoch": 0.00937, "grad_norm": 0.0018649434205144644, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.051562, "kv_vq_loss": 0.000459, "learning_rate": 0.0009929348977219443, "loss": 0.052057, "step": 9370, "value_mse_loss_layer_000": 0.000538, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.005524, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.010986, "value_mse_loss_layer_008": 0.013245, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.015259, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.022583, "value_mse_loss_layer_019": 0.021606, "value_mse_loss_layer_020": 0.023438, "value_mse_loss_layer_021": 0.026489, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.060547, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.057861, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.00012, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000198, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.00038, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000456, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000243, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000188, "vq_loss_layer_025": 0.000257, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000439, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.003036 }, { "ce_loss": 2.322399, "epoch": 0.00938, "grad_norm": 0.0014648573705926538, "key_mse_loss_layer_000": 0.00354, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.083496, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.051346, "kv_vq_loss": 0.000452, "learning_rate": 0.0009930507095947659, "loss": 0.051837, "step": 9380, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001656, "value_mse_loss_layer_002": 0.005768, "value_mse_loss_layer_003": 0.009644, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.009766, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.014282, "value_mse_loss_layer_009": 0.018555, "value_mse_loss_layer_010": 0.014832, "value_mse_loss_layer_011": 0.015442, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.018799, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.021851, "value_mse_loss_layer_016": 0.018311, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.022339, "value_mse_loss_layer_020": 0.023804, "value_mse_loss_layer_021": 0.026855, "value_mse_loss_layer_022": 0.028076, "value_mse_loss_layer_023": 0.030762, "value_mse_loss_layer_024": 0.038818, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.049072, "value_mse_loss_layer_028": 0.054688, "value_mse_loss_layer_029": 0.070801, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.064453, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000208, "vq_loss_layer_009": 0.000239, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.00037, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000167, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000224, "vq_loss_layer_023": 0.00021, "vq_loss_layer_024": 0.000349, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.00037, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.00079, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.00386 }, { "ce_loss": 2.261587, "epoch": 0.00939, "grad_norm": 0.0021978651639074087, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.081055, "kv_mse_loss": 0.051382, "kv_vq_loss": 0.000485, "learning_rate": 0.0009931663980665276, "loss": 0.05188, "step": 9390, "value_mse_loss_layer_000": 0.000568, "value_mse_loss_layer_001": 0.001633, "value_mse_loss_layer_002": 0.005524, "value_mse_loss_layer_003": 0.010986, "value_mse_loss_layer_004": 0.009094, "value_mse_loss_layer_005": 0.008423, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.010803, "value_mse_loss_layer_008": 0.013672, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014587, "value_mse_loss_layer_011": 0.015991, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.018921, "value_mse_loss_layer_015": 0.021362, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.021118, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.021973, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.026978, "value_mse_loss_layer_022": 0.026978, "value_mse_loss_layer_023": 0.030273, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.059082, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000372, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000477, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.001343, "vq_loss_layer_030": 0.003113, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.309979, "epoch": 0.0094, "grad_norm": 0.0021024078596383333, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.051141, "kv_vq_loss": 0.000448, "learning_rate": 0.0009932819633999244, "loss": 0.05163, "step": 9400, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001602, "value_mse_loss_layer_002": 0.005707, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.007812, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010437, "value_mse_loss_layer_008": 0.012939, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014587, "value_mse_loss_layer_011": 0.015076, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.016846, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.018188, "value_mse_loss_layer_019": 0.021606, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.026001, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.041016, "value_mse_loss_layer_025": 0.041992, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.056885, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000165, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000242, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000328, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000368, "vq_loss_layer_027": 0.000422, "vq_loss_layer_028": 0.000626, "vq_loss_layer_029": 0.000748, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.003006 }, { "ce_loss": 2.313522, "epoch": 0.00941, "grad_norm": 0.0025678162928670645, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.051111, "kv_vq_loss": 0.000429, "learning_rate": 0.0009933974058568141, "loss": 0.051599, "step": 9410, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001617, "value_mse_loss_layer_002": 0.005585, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.010071, "value_mse_loss_layer_007": 0.010925, "value_mse_loss_layer_008": 0.013855, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.014954, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.066895, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000199, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000246, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000164, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000214, "vq_loss_layer_024": 0.000243, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.000904, "vq_loss_layer_029": 0.000916, "vq_loss_layer_030": 0.001923, "vq_loss_layer_031": 0.003342 }, { "ce_loss": 2.272819, "epoch": 0.00942, "grad_norm": 0.002017635153606534, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.051492, "kv_vq_loss": 0.000445, "learning_rate": 0.0009935127256982192, "loss": 0.051984, "step": 9420, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.00161, "value_mse_loss_layer_002": 0.005707, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.009766, "value_mse_loss_layer_005": 0.009094, "value_mse_loss_layer_006": 0.010498, "value_mse_loss_layer_007": 0.010864, "value_mse_loss_layer_008": 0.013428, "value_mse_loss_layer_009": 0.017944, "value_mse_loss_layer_010": 0.01532, "value_mse_loss_layer_011": 0.015625, "value_mse_loss_layer_012": 0.017212, "value_mse_loss_layer_013": 0.018433, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.030396, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.057129, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 8.8e-05, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000219, "vq_loss_layer_011": 0.00022, "vq_loss_layer_012": 0.000387, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000408, "vq_loss_layer_017": 0.00041, "vq_loss_layer_018": 0.000241, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000465, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000277, "vq_loss_layer_025": 0.000326, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.00079, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.003571 }, { "ce_loss": 2.253108, "epoch": 0.00943, "grad_norm": 0.0016085094539448619, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.043945, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.104004, "key_mse_loss_layer_024": 0.083984, "key_mse_loss_layer_025": 0.08252, "key_mse_loss_layer_026": 0.092285, "key_mse_loss_layer_027": 0.096191, "key_mse_loss_layer_028": 0.101562, "key_mse_loss_layer_029": 0.100098, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.051364, "kv_vq_loss": 0.000458, "learning_rate": 0.0009936279231843321, "loss": 0.051855, "step": 9430, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001534, "value_mse_loss_layer_002": 0.006042, "value_mse_loss_layer_003": 0.010254, "value_mse_loss_layer_004": 0.009216, "value_mse_loss_layer_005": 0.008728, "value_mse_loss_layer_006": 0.010071, "value_mse_loss_layer_007": 0.011475, "value_mse_loss_layer_008": 0.013733, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.014648, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.018433, "value_mse_loss_layer_014": 0.019531, "value_mse_loss_layer_015": 0.021362, "value_mse_loss_layer_016": 0.017822, "value_mse_loss_layer_017": 0.022217, "value_mse_loss_layer_018": 0.02063, "value_mse_loss_layer_019": 0.022583, "value_mse_loss_layer_020": 0.025391, "value_mse_loss_layer_021": 0.030273, "value_mse_loss_layer_022": 0.030762, "value_mse_loss_layer_023": 0.035645, "value_mse_loss_layer_024": 0.039307, "value_mse_loss_layer_025": 0.046387, "value_mse_loss_layer_026": 0.050293, "value_mse_loss_layer_027": 0.057373, "value_mse_loss_layer_028": 0.065918, "value_mse_loss_layer_029": 0.081055, "value_mse_loss_layer_030": 0.083496, "value_mse_loss_layer_031": 0.075195, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000195, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000261, "vq_loss_layer_018": 0.000311, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000238, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000618, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.001244, "vq_loss_layer_029": 0.001411, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.00528 }, { "ce_loss": 2.283766, "epoch": 0.00944, "grad_norm": 0.0020931162871420383, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010986, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.106934, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.051392, "kv_vq_loss": 0.000439, "learning_rate": 0.0009937429985745173, "loss": 0.05188, "step": 9440, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001648, "value_mse_loss_layer_002": 0.005768, "value_mse_loss_layer_003": 0.009644, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.010742, "value_mse_loss_layer_008": 0.013245, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013611, "value_mse_loss_layer_011": 0.014526, "value_mse_loss_layer_012": 0.015503, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.021484, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.030396, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.043457, "value_mse_loss_layer_026": 0.037109, "value_mse_loss_layer_027": 0.049805, "value_mse_loss_layer_028": 0.052979, "value_mse_loss_layer_029": 0.069824, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000214, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000218, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000217, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.00036, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001373, "vq_loss_layer_030": 0.002502, "vq_loss_layer_031": 0.00412 }, { "ce_loss": 2.312114, "epoch": 0.00945, "grad_norm": 0.0015547327930107713, "key_mse_loss_layer_000": 0.002884, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.051111, "kv_vq_loss": 0.000444, "learning_rate": 0.0009938579521273157, "loss": 0.051605, "step": 9450, "value_mse_loss_layer_000": 0.000534, "value_mse_loss_layer_001": 0.001579, "value_mse_loss_layer_002": 0.005554, "value_mse_loss_layer_003": 0.009399, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.010986, "value_mse_loss_layer_008": 0.013428, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.015259, "value_mse_loss_layer_011": 0.015442, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.018311, "value_mse_loss_layer_014": 0.018555, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.056885, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.057373, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000177, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000362, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000385, "vq_loss_layer_027": 0.000422, "vq_loss_layer_028": 0.000713, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.001541, "vq_loss_layer_031": 0.003235 }, { "ce_loss": 2.338368, "epoch": 0.00946, "grad_norm": 0.0015919465804472566, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.062988, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.05152, "kv_vq_loss": 0.000459, "learning_rate": 0.000993972784100448, "loss": 0.052011, "step": 9460, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001617, "value_mse_loss_layer_002": 0.005585, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.007812, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010437, "value_mse_loss_layer_008": 0.013123, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014526, "value_mse_loss_layer_011": 0.015137, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.021362, "value_mse_loss_layer_016": 0.018188, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.018433, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.028198, "value_mse_loss_layer_022": 0.0271, "value_mse_loss_layer_023": 0.030762, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.061035, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.056152, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000414, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000404, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000141, "vq_loss_layer_020": 0.000166, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000197, "vq_loss_layer_023": 0.000218, "vq_loss_layer_024": 0.000185, "vq_loss_layer_025": 0.000248, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000866, "vq_loss_layer_030": 0.001945, "vq_loss_layer_031": 0.002762 }, { "ce_loss": 2.288191, "epoch": 0.00947, "grad_norm": 0.0018507527420297265, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.051297, "kv_vq_loss": 0.000445, "learning_rate": 0.0009940874947508183, "loss": 0.051788, "step": 9470, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001617, "value_mse_loss_layer_002": 0.005707, "value_mse_loss_layer_003": 0.009888, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.010925, "value_mse_loss_layer_008": 0.013367, "value_mse_loss_layer_009": 0.017456, "value_mse_loss_layer_010": 0.01416, "value_mse_loss_layer_011": 0.014893, "value_mse_loss_layer_012": 0.016235, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.021118, "value_mse_loss_layer_018": 0.018921, "value_mse_loss_layer_019": 0.021484, "value_mse_loss_layer_020": 0.026123, "value_mse_loss_layer_021": 0.026001, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.042969, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.061523, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000214, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000843, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.001732, "vq_loss_layer_031": 0.00351 }, { "ce_loss": 2.34624, "epoch": 0.00948, "grad_norm": 0.0018281899392604828, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.041016, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.0625, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.121582, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.057861, "kv_mse_loss": 0.051562, "kv_vq_loss": 0.000453, "learning_rate": 0.0009942020843345164, "loss": 0.052057, "step": 9480, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001579, "value_mse_loss_layer_002": 0.005676, "value_mse_loss_layer_003": 0.009583, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.008423, "value_mse_loss_layer_006": 0.010803, "value_mse_loss_layer_007": 0.011597, "value_mse_loss_layer_008": 0.013672, "value_mse_loss_layer_009": 0.018188, "value_mse_loss_layer_010": 0.01532, "value_mse_loss_layer_011": 0.016846, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.018433, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.030762, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.059814, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.059082, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000149, "vq_loss_layer_007": 0.000179, "vq_loss_layer_008": 0.000217, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000231, "vq_loss_layer_011": 0.00029, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.000366, "vq_loss_layer_014": 0.0005, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000391, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000236, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000191, "vq_loss_layer_023": 0.000349, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000387, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000954, "vq_loss_layer_029": 0.00106, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.004272 }, { "ce_loss": 2.310694, "epoch": 0.00949, "grad_norm": 0.001812413102015853, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.095703, "key_mse_loss_layer_029": 0.09375, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.085449, "kv_mse_loss": 0.051941, "kv_vq_loss": 0.000447, "learning_rate": 0.0009943165531068232, "loss": 0.052429, "step": 9490, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001633, "value_mse_loss_layer_002": 0.005829, "value_mse_loss_layer_003": 0.009705, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.010254, "value_mse_loss_layer_007": 0.010986, "value_mse_loss_layer_008": 0.013489, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014954, "value_mse_loss_layer_011": 0.015137, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.017944, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.019897, "value_mse_loss_layer_019": 0.023071, "value_mse_loss_layer_020": 0.024658, "value_mse_loss_layer_021": 0.027832, "value_mse_loss_layer_022": 0.029541, "value_mse_loss_layer_023": 0.032959, "value_mse_loss_layer_024": 0.036621, "value_mse_loss_layer_025": 0.043213, "value_mse_loss_layer_026": 0.039795, "value_mse_loss_layer_027": 0.049805, "value_mse_loss_layer_028": 0.05542, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000299, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000241, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000263, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000444, "vq_loss_layer_027": 0.000486, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.001297, "vq_loss_layer_030": 0.0019, "vq_loss_layer_031": 0.004211 }, { "ce_loss": 2.331212, "epoch": 0.0095, "grad_norm": 0.002085244283080101, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.051404, "kv_vq_loss": 0.000435, "learning_rate": 0.0009944309013222117, "loss": 0.051892, "step": 9500, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.005493, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010925, "value_mse_loss_layer_008": 0.012878, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.014343, "value_mse_loss_layer_011": 0.014771, "value_mse_loss_layer_012": 0.015564, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.030396, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.055908, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000391, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000163, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000366, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000298, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000343, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.003204 }, { "ce_loss": 2.353974, "epoch": 0.00951, "grad_norm": 0.0022862053010612726, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.09375, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.051489, "kv_vq_loss": 0.000458, "learning_rate": 0.0009945451292343534, "loss": 0.051984, "step": 9510, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.005493, "value_mse_loss_layer_003": 0.00946, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.010071, "value_mse_loss_layer_007": 0.010925, "value_mse_loss_layer_008": 0.013733, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014526, "value_mse_loss_layer_011": 0.015198, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.021362, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.018921, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.025513, "value_mse_loss_layer_021": 0.026978, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.030396, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.05957, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.060547, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000467, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000824, "vq_loss_layer_029": 0.001389, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.305999, "epoch": 0.00952, "grad_norm": 0.0027668005786836147, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.07959, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.076172, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.051196, "kv_vq_loss": 0.000432, "learning_rate": 0.0009946592370961185, "loss": 0.051685, "step": 9520, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001602, "value_mse_loss_layer_002": 0.005829, "value_mse_loss_layer_003": 0.009766, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008484, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013489, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.014771, "value_mse_loss_layer_011": 0.015381, "value_mse_loss_layer_012": 0.017212, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.016846, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.025269, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.068359, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.058105, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000224, "vq_loss_layer_010": 0.000206, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.000393, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.001663, "vq_loss_layer_031": 0.003677 }, { "ce_loss": 2.324827, "epoch": 0.00953, "grad_norm": 0.0019015661673620343, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.051352, "kv_vq_loss": 0.000437, "learning_rate": 0.0009947732251595815, "loss": 0.05184, "step": 9530, "value_mse_loss_layer_000": 0.000534, "value_mse_loss_layer_001": 0.00161, "value_mse_loss_layer_002": 0.005829, "value_mse_loss_layer_003": 0.009277, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.01062, "value_mse_loss_layer_008": 0.013123, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014404, "value_mse_loss_layer_011": 0.015076, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000389, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000163, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000207, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000401, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.000916, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.35518, "epoch": 0.00954, "grad_norm": 0.0022866097278892994, "key_mse_loss_layer_000": 0.00383, "key_mse_loss_layer_001": 0.011658, "key_mse_loss_layer_002": 0.061035, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.10498, "key_mse_loss_layer_014": 0.103027, "key_mse_loss_layer_015": 0.093262, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.089355, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.07666, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.092285, "key_mse_loss_layer_028": 0.095703, "key_mse_loss_layer_029": 0.092773, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.051401, "kv_vq_loss": 0.00046, "learning_rate": 0.0009948870936760236, "loss": 0.051892, "step": 9540, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001648, "value_mse_loss_layer_002": 0.006042, "value_mse_loss_layer_003": 0.010681, "value_mse_loss_layer_004": 0.009644, "value_mse_loss_layer_005": 0.00885, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010864, "value_mse_loss_layer_008": 0.013367, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.013916, "value_mse_loss_layer_011": 0.014648, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.019287, "value_mse_loss_layer_019": 0.022461, "value_mse_loss_layer_020": 0.024292, "value_mse_loss_layer_021": 0.026489, "value_mse_loss_layer_022": 0.028076, "value_mse_loss_layer_023": 0.033936, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.045898, "value_mse_loss_layer_026": 0.043701, "value_mse_loss_layer_027": 0.053711, "value_mse_loss_layer_028": 0.059082, "value_mse_loss_layer_029": 0.071777, "value_mse_loss_layer_030": 0.07666, "value_mse_loss_layer_031": 0.069336, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 4.3e-05, "vq_loss_layer_004": 7e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000197, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.00037, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.00025, "vq_loss_layer_019": 0.000206, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.000433, "vq_loss_layer_026": 0.000648, "vq_loss_layer_027": 0.000706, "vq_loss_layer_028": 0.001221, "vq_loss_layer_029": 0.001686, "vq_loss_layer_030": 0.003159, "vq_loss_layer_031": 0.005524 }, { "ce_loss": 2.314722, "epoch": 0.00955, "grad_norm": 0.0017827774863690138, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.051013, "kv_vq_loss": 0.000428, "learning_rate": 0.0009950008428959364, "loss": 0.051501, "step": 9550, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.00164, "value_mse_loss_layer_002": 0.005554, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.007996, "value_mse_loss_layer_006": 0.01062, "value_mse_loss_layer_007": 0.01062, "value_mse_loss_layer_008": 0.013245, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014526, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.0177, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.018066, "value_mse_loss_layer_019": 0.021606, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.027588, "value_mse_loss_layer_022": 0.026611, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.038086, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.059326, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000147, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000397, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000364, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000309, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000622, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.003204 }, { "ce_loss": 2.334386, "epoch": 0.00956, "grad_norm": 0.002089417539536953, "key_mse_loss_layer_000": 0.002792, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.051254, "kv_vq_loss": 0.000431, "learning_rate": 0.000995114473069025, "loss": 0.051736, "step": 9560, "value_mse_loss_layer_000": 0.000526, "value_mse_loss_layer_001": 0.001572, "value_mse_loss_layer_002": 0.005798, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.008484, "value_mse_loss_layer_006": 0.010559, "value_mse_loss_layer_007": 0.012695, "value_mse_loss_layer_008": 0.013428, "value_mse_loss_layer_009": 0.018188, "value_mse_loss_layer_010": 0.015442, "value_mse_loss_layer_011": 0.016113, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.019043, "value_mse_loss_layer_014": 0.019043, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.017944, "value_mse_loss_layer_017": 0.021729, "value_mse_loss_layer_018": 0.018066, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.02356, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.060547, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.058105, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000236, "vq_loss_layer_008": 0.000179, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000214, "vq_loss_layer_011": 0.000231, "vq_loss_layer_012": 0.000385, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000357, "vq_loss_layer_017": 0.000431, "vq_loss_layer_018": 0.00021, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000237, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000315, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000429, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.003479 }, { "ce_loss": 2.32437, "epoch": 0.00957, "grad_norm": 0.001984845381230116, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.093262, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.051208, "kv_vq_loss": 0.000456, "learning_rate": 0.0009952279844442106, "loss": 0.0517, "step": 9570, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.00161, "value_mse_loss_layer_002": 0.006195, "value_mse_loss_layer_003": 0.009399, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.008423, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.010742, "value_mse_loss_layer_008": 0.013428, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.013977, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.018188, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.065918, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000294, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003525 }, { "ce_loss": 2.323178, "epoch": 0.00958, "grad_norm": 0.0018020792631432414, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.061035, "kv_mse_loss": 0.051544, "kv_vq_loss": 0.000448, "learning_rate": 0.000995341377269636, "loss": 0.052042, "step": 9580, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.006226, "value_mse_loss_layer_003": 0.010803, "value_mse_loss_layer_004": 0.009155, "value_mse_loss_layer_005": 0.008423, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013428, "value_mse_loss_layer_009": 0.017456, "value_mse_loss_layer_010": 0.014465, "value_mse_loss_layer_011": 0.015259, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.022095, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.0271, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.036621, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.037109, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.00022, "vq_loss_layer_009": 0.000242, "vq_loss_layer_010": 0.000256, "vq_loss_layer_011": 0.000243, "vq_loss_layer_012": 0.000383, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000477, "vq_loss_layer_015": 0.000465, "vq_loss_layer_016": 0.000429, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000204, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000397, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000381, "vq_loss_layer_025": 0.000418, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000687, "vq_loss_layer_028": 0.000973, "vq_loss_layer_029": 0.001282, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.005066 }, { "ce_loss": 2.344511, "epoch": 0.00959, "grad_norm": 0.0019875923171639442, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.11084, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.10498, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.104004, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.051453, "kv_vq_loss": 0.000471, "learning_rate": 0.0009954546517926658, "loss": 0.05195, "step": 9590, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.005768, "value_mse_loss_layer_003": 0.009888, "value_mse_loss_layer_004": 0.008728, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.010925, "value_mse_loss_layer_008": 0.012817, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014954, "value_mse_loss_layer_011": 0.015259, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.022705, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.05835, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000184, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000242, "vq_loss_layer_010": 0.00021, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000387, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000248, "vq_loss_layer_021": 0.000397, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000393, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.000862, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.003738 }, { "ce_loss": 2.272508, "epoch": 0.0096, "grad_norm": 0.0014683788176625967, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.051898, "kv_vq_loss": 0.000455, "learning_rate": 0.0009955678082598921, "loss": 0.052396, "step": 9600, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.00161, "value_mse_loss_layer_002": 0.005676, "value_mse_loss_layer_003": 0.009644, "value_mse_loss_layer_004": 0.008728, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.010437, "value_mse_loss_layer_007": 0.010986, "value_mse_loss_layer_008": 0.013611, "value_mse_loss_layer_009": 0.017456, "value_mse_loss_layer_010": 0.014587, "value_mse_loss_layer_011": 0.015442, "value_mse_loss_layer_012": 0.017334, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.024658, "value_mse_loss_layer_020": 0.023804, "value_mse_loss_layer_021": 0.027222, "value_mse_loss_layer_022": 0.02771, "value_mse_loss_layer_023": 0.030396, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.047607, "value_mse_loss_layer_028": 0.052734, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.060059, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.00041, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000242, "vq_loss_layer_023": 0.000225, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000241, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.001831, "vq_loss_layer_031": 0.003281 }, { "ce_loss": 2.26498, "epoch": 0.00961, "grad_norm": 0.001973397098481655, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.051282, "kv_vq_loss": 0.000445, "learning_rate": 0.0009956808469171361, "loss": 0.051776, "step": 9610, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001617, "value_mse_loss_layer_002": 0.005524, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010864, "value_mse_loss_layer_008": 0.013245, "value_mse_loss_layer_009": 0.017944, "value_mse_loss_layer_010": 0.014587, "value_mse_loss_layer_011": 0.01532, "value_mse_loss_layer_012": 0.016235, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018555, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.019165, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.021851, "value_mse_loss_layer_020": 0.023193, "value_mse_loss_layer_021": 0.02771, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.050781, "value_mse_loss_layer_029": 0.066406, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000176, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000431, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000205, "vq_loss_layer_020": 0.000227, "vq_loss_layer_021": 0.000393, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.00021, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000456, "vq_loss_layer_027": 0.000486, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.002441, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.281249, "epoch": 0.00962, "grad_norm": 0.0018086639465764165, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.124023, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.060547, "kv_mse_loss": 0.051682, "kv_vq_loss": 0.000455, "learning_rate": 0.0009957937680094532, "loss": 0.052173, "step": 9620, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001541, "value_mse_loss_layer_002": 0.005524, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.011108, "value_mse_loss_layer_008": 0.013611, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.015747, "value_mse_loss_layer_011": 0.015869, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.019287, "value_mse_loss_layer_014": 0.019043, "value_mse_loss_layer_015": 0.02124, "value_mse_loss_layer_016": 0.016846, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.059326, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000188, "vq_loss_layer_008": 0.000224, "vq_loss_layer_009": 0.000239, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000238, "vq_loss_layer_012": 0.000383, "vq_loss_layer_013": 0.000343, "vq_loss_layer_014": 0.000435, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000368, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000198, "vq_loss_layer_020": 0.000221, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.00037, "vq_loss_layer_024": 0.00034, "vq_loss_layer_025": 0.000465, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.329166, "epoch": 0.00963, "grad_norm": 0.001616300898604095, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.092285, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.051266, "kv_vq_loss": 0.000438, "learning_rate": 0.0009959065717811335, "loss": 0.051752, "step": 9630, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.005707, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.008423, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.010742, "value_mse_loss_layer_008": 0.013245, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014343, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.0177, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.022095, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.021606, "value_mse_loss_layer_020": 0.025269, "value_mse_loss_layer_021": 0.025879, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.030273, "value_mse_loss_layer_024": 0.035645, "value_mse_loss_layer_025": 0.039551, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.061035, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.05835, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000511, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000219, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.000969, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.318677, "epoch": 0.00964, "grad_norm": 0.0017218418652191758, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.051309, "kv_vq_loss": 0.000441, "learning_rate": 0.0009960192584757075, "loss": 0.051801, "step": 9640, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001564, "value_mse_loss_layer_002": 0.005676, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.008423, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.011169, "value_mse_loss_layer_008": 0.013123, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.014282, "value_mse_loss_layer_011": 0.015503, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.026611, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.028564, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.058594, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000269, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000391, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000224, "vq_loss_layer_025": 0.000294, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.274384, "epoch": 0.00965, "grad_norm": 0.0018007740145549178, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.052048, "kv_vq_loss": 0.000464, "learning_rate": 0.0009961318283359482, "loss": 0.052548, "step": 9650, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001579, "value_mse_loss_layer_002": 0.005554, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.010254, "value_mse_loss_layer_007": 0.010681, "value_mse_loss_layer_008": 0.013306, "value_mse_loss_layer_009": 0.017456, "value_mse_loss_layer_010": 0.014832, "value_mse_loss_layer_011": 0.015198, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.025635, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.057373, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.057373, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000189, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.00037, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000239, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.001923, "vq_loss_layer_031": 0.003448 }, { "ce_loss": 2.267164, "epoch": 0.00966, "grad_norm": 0.0018462527077645063, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.051489, "kv_vq_loss": 0.00045, "learning_rate": 0.0009962442816038733, "loss": 0.051987, "step": 9660, "value_mse_loss_layer_000": 0.000526, "value_mse_loss_layer_001": 0.001572, "value_mse_loss_layer_002": 0.005493, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.013245, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.014893, "value_mse_loss_layer_012": 0.015442, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.021362, "value_mse_loss_layer_019": 0.023926, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.060791, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000368, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000408, "vq_loss_layer_027": 0.000439, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.001831, "vq_loss_layer_031": 0.003357 }, { "ce_loss": 2.272866, "epoch": 0.00967, "grad_norm": 0.0017235733103007078, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.051663, "kv_vq_loss": 0.000446, "learning_rate": 0.0009963566185207501, "loss": 0.052161, "step": 9670, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.005798, "value_mse_loss_layer_003": 0.009277, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.011108, "value_mse_loss_layer_008": 0.012878, "value_mse_loss_layer_009": 0.017456, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020386, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.023315, "value_mse_loss_layer_021": 0.025635, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.060059, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.057373, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000184, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000391, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000633, "vq_loss_layer_029": 0.000736, "vq_loss_layer_030": 0.001762, "vq_loss_layer_031": 0.003052 }, { "ce_loss": 2.258423, "epoch": 0.00968, "grad_norm": 0.0017878460930660367, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.051675, "kv_vq_loss": 0.000453, "learning_rate": 0.0009964688393270984, "loss": 0.052167, "step": 9680, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001579, "value_mse_loss_layer_002": 0.00592, "value_mse_loss_layer_003": 0.009094, "value_mse_loss_layer_004": 0.008728, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.010742, "value_mse_loss_layer_008": 0.013306, "value_mse_loss_layer_009": 0.017456, "value_mse_loss_layer_010": 0.014648, "value_mse_loss_layer_011": 0.015259, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.016846, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.026855, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.060547, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.060059, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000189, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000343, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000401, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.00069, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.002609, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.272772, "epoch": 0.00969, "grad_norm": 0.0020599220879375935, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.051584, "kv_vq_loss": 0.000466, "learning_rate": 0.0009965809442626912, "loss": 0.052081, "step": 9690, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001663, "value_mse_loss_layer_002": 0.005615, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.007996, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010559, "value_mse_loss_layer_008": 0.013123, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.015259, "value_mse_loss_layer_012": 0.016846, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.025635, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.05835, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000414, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000215, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000916, "vq_loss_layer_030": 0.001938, "vq_loss_layer_031": 0.003128 }, { "ce_loss": 2.288876, "epoch": 0.0097, "grad_norm": 0.0024126574862748384, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.051544, "kv_vq_loss": 0.00044, "learning_rate": 0.000996692933566561, "loss": 0.052029, "step": 9700, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001587, "value_mse_loss_layer_002": 0.005585, "value_mse_loss_layer_003": 0.00946, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008728, "value_mse_loss_layer_006": 0.01062, "value_mse_loss_layer_007": 0.011169, "value_mse_loss_layer_008": 0.013306, "value_mse_loss_layer_009": 0.018433, "value_mse_loss_layer_010": 0.014893, "value_mse_loss_layer_011": 0.015564, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.018433, "value_mse_loss_layer_014": 0.018799, "value_mse_loss_layer_015": 0.021729, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.019165, "value_mse_loss_layer_019": 0.021973, "value_mse_loss_layer_020": 0.023682, "value_mse_loss_layer_021": 0.026611, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.030884, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.044678, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.058838, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.059326, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 8.4e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.00018, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000259, "vq_loss_layer_010": 0.000189, "vq_loss_layer_011": 0.000223, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000349, "vq_loss_layer_018": 0.00023, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000381, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000326, "vq_loss_layer_024": 0.000232, "vq_loss_layer_025": 0.000412, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000954, "vq_loss_layer_029": 0.001221, "vq_loss_layer_030": 0.001892, "vq_loss_layer_031": 0.004822 }, { "ce_loss": 2.320063, "epoch": 0.00971, "grad_norm": 0.0016633294289931655, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.051718, "kv_vq_loss": 0.000457, "learning_rate": 0.000996804807477001, "loss": 0.052213, "step": 9710, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001572, "value_mse_loss_layer_002": 0.005554, "value_mse_loss_layer_003": 0.009399, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.010986, "value_mse_loss_layer_008": 0.013306, "value_mse_loss_layer_009": 0.017456, "value_mse_loss_layer_010": 0.01416, "value_mse_loss_layer_011": 0.015381, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.025513, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.058838, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000223, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000387, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000218, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000397, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.003632 }, { "ce_loss": 2.286464, "epoch": 0.00972, "grad_norm": 0.0022470951080322266, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.051407, "kv_vq_loss": 0.000438, "learning_rate": 0.0009969165662315686, "loss": 0.051901, "step": 9720, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.005798, "value_mse_loss_layer_003": 0.009094, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.013, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.018433, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.026367, "value_mse_loss_layer_022": 0.026611, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.053467, "value_mse_loss_layer_029": 0.064941, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000141, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000257, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000484, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.337782, "epoch": 0.00973, "grad_norm": 0.0018742908723652363, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.051746, "kv_vq_loss": 0.000448, "learning_rate": 0.0009970282100670877, "loss": 0.052228, "step": 9730, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001587, "value_mse_loss_layer_002": 0.005768, "value_mse_loss_layer_003": 0.009888, "value_mse_loss_layer_004": 0.009155, "value_mse_loss_layer_005": 0.008789, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.011292, "value_mse_loss_layer_008": 0.013, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.017578, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.018433, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.025635, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.029419, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.038086, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.060791, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000187, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000393, "vq_loss_layer_015": 0.000429, "vq_loss_layer_016": 0.000404, "vq_loss_layer_017": 0.000368, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000181, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000607, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000889, "vq_loss_layer_029": 0.000961, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.004089 }, { "ce_loss": 2.312671, "epoch": 0.00974, "grad_norm": 0.002007151488214731, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.051257, "kv_vq_loss": 0.000467, "learning_rate": 0.0009971397392196536, "loss": 0.051755, "step": 9740, "value_mse_loss_layer_000": 0.000538, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.005463, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008179, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.01062, "value_mse_loss_layer_008": 0.013367, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014404, "value_mse_loss_layer_011": 0.015137, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.018188, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.025879, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.031128, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.044922, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.057373, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.00012, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000179, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.000751, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.239122, "epoch": 0.00975, "grad_norm": 0.0014396790647879243, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.051834, "kv_vq_loss": 0.000432, "learning_rate": 0.000997251153924634, "loss": 0.052316, "step": 9750, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001602, "value_mse_loss_layer_002": 0.005585, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.008423, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.010376, "value_mse_loss_layer_007": 0.010681, "value_mse_loss_layer_008": 0.013062, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.015076, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.019287, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.025879, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.058838, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.058105, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000385, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000203, "vq_loss_layer_023": 0.000231, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.000391, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000866, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.351751, "epoch": 0.00976, "grad_norm": 0.0016576796770095825, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.051102, "kv_vq_loss": 0.000442, "learning_rate": 0.000997362454416673, "loss": 0.051596, "step": 9760, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001572, "value_mse_loss_layer_002": 0.005646, "value_mse_loss_layer_003": 0.009277, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010559, "value_mse_loss_layer_008": 0.013062, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.014343, "value_mse_loss_layer_011": 0.014771, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020386, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.02478, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000215, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000205, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000376, "vq_loss_layer_027": 0.000427, "vq_loss_layer_028": 0.000668, "vq_loss_layer_029": 0.000729, "vq_loss_layer_030": 0.001511, "vq_loss_layer_031": 0.003357 }, { "ce_loss": 2.31618, "epoch": 0.00977, "grad_norm": 0.0022988144773989916, "key_mse_loss_layer_000": 0.003479, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.051227, "kv_vq_loss": 0.000453, "learning_rate": 0.000997473640929693, "loss": 0.051718, "step": 9770, "value_mse_loss_layer_000": 0.000538, "value_mse_loss_layer_001": 0.001587, "value_mse_loss_layer_002": 0.005615, "value_mse_loss_layer_003": 0.00946, "value_mse_loss_layer_004": 0.008667, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.010437, "value_mse_loss_layer_007": 0.010925, "value_mse_loss_layer_008": 0.013428, "value_mse_loss_layer_009": 0.017944, "value_mse_loss_layer_010": 0.014832, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.018311, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.02124, "value_mse_loss_layer_016": 0.016968, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.022705, "value_mse_loss_layer_021": 0.026489, "value_mse_loss_layer_022": 0.026978, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.065918, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.058838, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000176, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000198, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.000223, "vq_loss_layer_021": 0.000381, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000576, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.003479 }, { "ce_loss": 2.322178, "epoch": 0.00978, "grad_norm": 0.0017216752748936415, "key_mse_loss_layer_000": 0.003586, "key_mse_loss_layer_001": 0.011597, "key_mse_loss_layer_002": 0.063477, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.080566, "key_mse_loss_layer_008": 0.091309, "key_mse_loss_layer_009": 0.097656, "key_mse_loss_layer_010": 0.110352, "key_mse_loss_layer_011": 0.106445, "key_mse_loss_layer_012": 0.081055, "key_mse_loss_layer_013": 0.128906, "key_mse_loss_layer_014": 0.125977, "key_mse_loss_layer_015": 0.117188, "key_mse_loss_layer_016": 0.107422, "key_mse_loss_layer_017": 0.10791, "key_mse_loss_layer_018": 0.114746, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.100586, "key_mse_loss_layer_022": 0.103516, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.090332, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.051096, "kv_vq_loss": 0.000437, "learning_rate": 0.0009975847136969001, "loss": 0.051587, "step": 9780, "value_mse_loss_layer_000": 0.000526, "value_mse_loss_layer_001": 0.001617, "value_mse_loss_layer_002": 0.005951, "value_mse_loss_layer_003": 0.010376, "value_mse_loss_layer_004": 0.009094, "value_mse_loss_layer_005": 0.008667, "value_mse_loss_layer_006": 0.010376, "value_mse_loss_layer_007": 0.011841, "value_mse_loss_layer_008": 0.01355, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.015564, "value_mse_loss_layer_011": 0.016479, "value_mse_loss_layer_012": 0.017212, "value_mse_loss_layer_013": 0.018555, "value_mse_loss_layer_014": 0.019653, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.016846, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.061035, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.7e-05, "vq_loss_layer_002": 2.7e-05, "vq_loss_layer_003": 4.2e-05, "vq_loss_layer_004": 7e-05, "vq_loss_layer_005": 9.3e-05, "vq_loss_layer_006": 0.000148, "vq_loss_layer_007": 0.000213, "vq_loss_layer_008": 0.000252, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000284, "vq_loss_layer_011": 0.000282, "vq_loss_layer_012": 0.000425, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.000534, "vq_loss_layer_015": 0.000622, "vq_loss_layer_016": 0.000515, "vq_loss_layer_017": 0.000479, "vq_loss_layer_018": 0.000267, "vq_loss_layer_019": 0.000242, "vq_loss_layer_020": 0.000349, "vq_loss_layer_021": 0.00053, "vq_loss_layer_022": 0.000484, "vq_loss_layer_023": 0.000496, "vq_loss_layer_024": 0.000725, "vq_loss_layer_025": 0.000717, "vq_loss_layer_026": 0.000946, "vq_loss_layer_027": 0.001099, "vq_loss_layer_028": 0.001389, "vq_loss_layer_029": 0.001442, "vq_loss_layer_030": 0.003693, "vq_loss_layer_031": 0.006012 }, { "ce_loss": 2.271276, "epoch": 0.00979, "grad_norm": 0.0014709311071783304, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.051453, "kv_vq_loss": 0.000454, "learning_rate": 0.0009976956729507841, "loss": 0.051941, "step": 9790, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.005707, "value_mse_loss_layer_003": 0.009094, "value_mse_loss_layer_004": 0.008423, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010559, "value_mse_loss_layer_008": 0.013062, "value_mse_loss_layer_009": 0.017944, "value_mse_loss_layer_010": 0.014465, "value_mse_loss_layer_011": 0.015259, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.017456, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.018433, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.026001, "value_mse_loss_layer_022": 0.027222, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.05957, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000176, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000278, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.00036, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.000423, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000242, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000484, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.001678, "vq_loss_layer_031": 0.003113 }, { "ce_loss": 2.283038, "epoch": 0.0098, "grad_norm": 0.0017261956818401814, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.051144, "kv_vq_loss": 0.000451, "learning_rate": 0.0009978065189231236, "loss": 0.051642, "step": 9800, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001579, "value_mse_loss_layer_002": 0.005402, "value_mse_loss_layer_003": 0.008972, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.01062, "value_mse_loss_layer_008": 0.01355, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014526, "value_mse_loss_layer_011": 0.015442, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.0177, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.02124, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.023682, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.05957, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.057129, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000215, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000226, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.00036, "vq_loss_layer_027": 0.000444, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.000813, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.307837, "epoch": 0.00981, "grad_norm": 0.001967875985428691, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.051187, "kv_vq_loss": 0.000432, "learning_rate": 0.000997917251844987, "loss": 0.051675, "step": 9810, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001572, "value_mse_loss_layer_002": 0.005676, "value_mse_loss_layer_003": 0.009705, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010864, "value_mse_loss_layer_008": 0.013062, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014465, "value_mse_loss_layer_011": 0.015137, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.021851, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.026611, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.039062, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.05957, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.059814, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000404, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000406, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000288, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.000633, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.00071, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.00177, "vq_loss_layer_031": 0.003418 }, { "ce_loss": 2.341893, "epoch": 0.00982, "grad_norm": 0.002364794723689556, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.051175, "kv_vq_loss": 0.000435, "learning_rate": 0.0009980278719467373, "loss": 0.051663, "step": 9820, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001572, "value_mse_loss_layer_002": 0.005646, "value_mse_loss_layer_003": 0.009399, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.010681, "value_mse_loss_layer_008": 0.013184, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.013977, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.016968, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.021362, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.028564, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000345, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000324, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.289672, "epoch": 0.00983, "grad_norm": 0.0014053870690986514, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.051364, "kv_vq_loss": 0.000446, "learning_rate": 0.0009981383794580335, "loss": 0.051852, "step": 9830, "value_mse_loss_layer_000": 0.000538, "value_mse_loss_layer_001": 0.001572, "value_mse_loss_layer_002": 0.005371, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010803, "value_mse_loss_layer_008": 0.013123, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014771, "value_mse_loss_layer_011": 0.014893, "value_mse_loss_layer_012": 0.015442, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.021851, "value_mse_loss_layer_020": 0.023926, "value_mse_loss_layer_021": 0.026367, "value_mse_loss_layer_022": 0.027344, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.061035, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000236, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000208, "vq_loss_layer_025": 0.000248, "vq_loss_layer_026": 0.000401, "vq_loss_layer_027": 0.000444, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.304919, "epoch": 0.00984, "grad_norm": 0.0018090883968397975, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.061035, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.096191, "key_mse_loss_layer_009": 0.100586, "key_mse_loss_layer_010": 0.116699, "key_mse_loss_layer_011": 0.111328, "key_mse_loss_layer_012": 0.08252, "key_mse_loss_layer_013": 0.146484, "key_mse_loss_layer_014": 0.139648, "key_mse_loss_layer_015": 0.126953, "key_mse_loss_layer_016": 0.121094, "key_mse_loss_layer_017": 0.119629, "key_mse_loss_layer_018": 0.12793, "key_mse_loss_layer_019": 0.102539, "key_mse_loss_layer_020": 0.115723, "key_mse_loss_layer_021": 0.108887, "key_mse_loss_layer_022": 0.115723, "key_mse_loss_layer_023": 0.117188, "key_mse_loss_layer_024": 0.095215, "key_mse_loss_layer_025": 0.085449, "key_mse_loss_layer_026": 0.104004, "key_mse_loss_layer_027": 0.099609, "key_mse_loss_layer_028": 0.106445, "key_mse_loss_layer_029": 0.094727, "key_mse_loss_layer_030": 0.108398, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.051202, "kv_vq_loss": 0.000442, "learning_rate": 0.0009982487746078353, "loss": 0.051685, "step": 9840, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001549, "value_mse_loss_layer_002": 0.005981, "value_mse_loss_layer_003": 0.009949, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.008484, "value_mse_loss_layer_006": 0.010254, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.012756, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014465, "value_mse_loss_layer_011": 0.015198, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.060059, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000133, "vq_loss_layer_007": 0.000179, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000301, "vq_loss_layer_010": 0.000252, "vq_loss_layer_011": 0.000256, "vq_loss_layer_012": 0.000418, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000433, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.000207, "vq_loss_layer_021": 0.000433, "vq_loss_layer_022": 0.00028, "vq_loss_layer_023": 0.00033, "vq_loss_layer_024": 0.000349, "vq_loss_layer_025": 0.000549, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.000706, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.001266, "vq_loss_layer_030": 0.003418, "vq_loss_layer_031": 0.005157 }, { "ce_loss": 2.292289, "epoch": 0.00985, "grad_norm": 0.0018494436517357826, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.051544, "kv_vq_loss": 0.000464, "learning_rate": 0.0009983590576244027, "loss": 0.052045, "step": 9850, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.001602, "value_mse_loss_layer_002": 0.005707, "value_mse_loss_layer_003": 0.00946, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010559, "value_mse_loss_layer_008": 0.012939, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.01416, "value_mse_loss_layer_011": 0.014893, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.019531, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.020874, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.060059, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.05835, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000179, "vq_loss_layer_009": 0.000241, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.000223, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000416, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000456, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000256, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000343, "vq_loss_layer_022": 0.000311, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.003525 }, { "ce_loss": 2.302697, "epoch": 0.00986, "grad_norm": 0.0019133646273985505, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.05155, "kv_vq_loss": 0.000453, "learning_rate": 0.0009984692287353025, "loss": 0.052048, "step": 9860, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001572, "value_mse_loss_layer_002": 0.005463, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.010742, "value_mse_loss_layer_008": 0.013184, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.014526, "value_mse_loss_layer_011": 0.015137, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.0177, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.017578, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.023071, "value_mse_loss_layer_021": 0.025635, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.060791, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.060059, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000182, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000376, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000221, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000238, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.003586 }, { "ce_loss": 2.247513, "epoch": 0.00987, "grad_norm": 0.0018142133485525846, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.051517, "kv_vq_loss": 0.000437, "learning_rate": 0.000998579288167409, "loss": 0.052008, "step": 9870, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001587, "value_mse_loss_layer_002": 0.005554, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.008728, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.010559, "value_mse_loss_layer_008": 0.013123, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.014282, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.016602, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.026978, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.046631, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.058838, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.00037, "vq_loss_layer_022": 0.000219, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000383, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.001869, "vq_loss_layer_031": 0.003418 }, { "ce_loss": 2.288051, "epoch": 0.00988, "grad_norm": 0.0016348811332136393, "key_mse_loss_layer_000": 0.003479, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.051236, "kv_vq_loss": 0.000459, "learning_rate": 0.000998689236146907, "loss": 0.051736, "step": 9880, "value_mse_loss_layer_000": 0.000557, "value_mse_loss_layer_001": 0.00161, "value_mse_loss_layer_002": 0.005554, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.008728, "value_mse_loss_layer_005": 0.008179, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010559, "value_mse_loss_layer_008": 0.013184, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.018066, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.060547, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000199, "vq_loss_layer_009": 0.000259, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000224, "vq_loss_layer_023": 0.000236, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000366, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000885, "vq_loss_layer_029": 0.000977, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.00354 }, { "ce_loss": 2.291832, "epoch": 0.00989, "grad_norm": 0.0017416935879737139, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.051135, "kv_vq_loss": 0.000435, "learning_rate": 0.0009987990728992948, "loss": 0.05162, "step": 9890, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.005737, "value_mse_loss_layer_003": 0.009277, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008484, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.012878, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014465, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.015442, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020386, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.026245, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.05835, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.056396, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000179, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.000209, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000393, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000395, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000912, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.003708 }, { "ce_loss": 2.286686, "epoch": 0.0099, "grad_norm": 0.0015439679846167564, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.051294, "kv_vq_loss": 0.000451, "learning_rate": 0.0009989087986493872, "loss": 0.051785, "step": 9900, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001549, "value_mse_loss_layer_002": 0.005402, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.013123, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.015259, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.021484, "value_mse_loss_layer_020": 0.022949, "value_mse_loss_layer_021": 0.025879, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.058594, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.058105, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.00038, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000205, "vq_loss_layer_025": 0.000248, "vq_loss_layer_026": 0.000408, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.003204 }, { "ce_loss": 2.311941, "epoch": 0.00991, "grad_norm": 0.0014555378584191203, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.051321, "kv_vq_loss": 0.000475, "learning_rate": 0.0009990184136213186, "loss": 0.051819, "step": 9910, "value_mse_loss_layer_000": 0.000534, "value_mse_loss_layer_001": 0.001564, "value_mse_loss_layer_002": 0.005341, "value_mse_loss_layer_003": 0.008972, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.007812, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.01062, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015442, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.020386, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.055908, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 0.000103, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000324, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.000725, "vq_loss_layer_030": 0.001938, "vq_loss_layer_031": 0.003128 }, { "ce_loss": 2.333681, "epoch": 0.00992, "grad_norm": 0.0017001790693029761, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.059326, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.051389, "kv_vq_loss": 0.000445, "learning_rate": 0.0009991279180385445, "loss": 0.051886, "step": 9920, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001572, "value_mse_loss_layer_002": 0.006866, "value_mse_loss_layer_003": 0.009094, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010803, "value_mse_loss_layer_008": 0.014099, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.014954, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.017334, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.019043, "value_mse_loss_layer_015": 0.022217, "value_mse_loss_layer_016": 0.016968, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.019165, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.028076, "value_mse_loss_layer_022": 0.027588, "value_mse_loss_layer_023": 0.029419, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.060303, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.058105, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000207, "vq_loss_layer_009": 0.000269, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000248, "vq_loss_layer_012": 0.00045, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000414, "vq_loss_layer_015": 0.000568, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000231, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000201, "vq_loss_layer_025": 0.000242, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.003113 }, { "ce_loss": 2.173334, "epoch": 0.00993, "grad_norm": 0.002077948534861207, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.044189, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.051617, "kv_vq_loss": 0.000474, "learning_rate": 0.0009992373121238453, "loss": 0.052115, "step": 9930, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001564, "value_mse_loss_layer_002": 0.005585, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.011353, "value_mse_loss_layer_008": 0.013489, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014832, "value_mse_loss_layer_011": 0.015503, "value_mse_loss_layer_012": 0.016602, "value_mse_loss_layer_013": 0.017944, "value_mse_loss_layer_014": 0.018677, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.016602, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.023193, "value_mse_loss_layer_021": 0.026978, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000216, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000402, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000383, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.000223, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000389, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000351, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.000729, "vq_loss_layer_026": 0.000607, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.001068, "vq_loss_layer_029": 0.001427, "vq_loss_layer_030": 0.002731, "vq_loss_layer_031": 0.005463 }, { "ce_loss": 2.290123, "epoch": 0.00994, "grad_norm": 0.0017298869788646698, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.093262, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.089355, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.085449, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.051727, "kv_vq_loss": 0.000447, "learning_rate": 0.0009993465960993282, "loss": 0.052216, "step": 9940, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001541, "value_mse_loss_layer_002": 0.005615, "value_mse_loss_layer_003": 0.009277, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010803, "value_mse_loss_layer_008": 0.013, "value_mse_loss_layer_009": 0.017456, "value_mse_loss_layer_010": 0.013916, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.045654, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.061768, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000359, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000209, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000332, "vq_loss_layer_025": 0.000414, "vq_loss_layer_026": 0.000553, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.00103, "vq_loss_layer_029": 0.001122, "vq_loss_layer_030": 0.00264, "vq_loss_layer_031": 0.004486 }, { "ce_loss": 2.293392, "epoch": 0.00995, "grad_norm": 0.00196643965318799, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.091797, "key_mse_loss_layer_009": 0.098145, "key_mse_loss_layer_010": 0.110352, "key_mse_loss_layer_011": 0.10791, "key_mse_loss_layer_012": 0.081543, "key_mse_loss_layer_013": 0.143555, "key_mse_loss_layer_014": 0.138672, "key_mse_loss_layer_015": 0.125977, "key_mse_loss_layer_016": 0.119629, "key_mse_loss_layer_017": 0.118164, "key_mse_loss_layer_018": 0.123047, "key_mse_loss_layer_019": 0.098145, "key_mse_loss_layer_020": 0.112305, "key_mse_loss_layer_021": 0.109375, "key_mse_loss_layer_022": 0.113281, "key_mse_loss_layer_023": 0.109375, "key_mse_loss_layer_024": 0.084473, "key_mse_loss_layer_025": 0.079102, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.092773, "key_mse_loss_layer_028": 0.099609, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.09668, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.051535, "kv_vq_loss": 0.000446, "learning_rate": 0.0009994557701864313, "loss": 0.052026, "step": 9950, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001518, "value_mse_loss_layer_002": 0.005463, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.00824, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010864, "value_mse_loss_layer_008": 0.013428, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.015137, "value_mse_loss_layer_011": 0.015503, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.036377, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 8.1e-05, "vq_loss_layer_005": 8.5e-05, "vq_loss_layer_006": 0.000143, "vq_loss_layer_007": 0.000194, "vq_loss_layer_008": 0.000288, "vq_loss_layer_009": 0.00034, "vq_loss_layer_010": 0.00029, "vq_loss_layer_011": 0.000277, "vq_loss_layer_012": 0.000441, "vq_loss_layer_013": 0.00036, "vq_loss_layer_014": 0.000488, "vq_loss_layer_015": 0.000441, "vq_loss_layer_016": 0.000408, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000198, "vq_loss_layer_020": 0.000242, "vq_loss_layer_021": 0.000523, "vq_loss_layer_022": 0.000349, "vq_loss_layer_023": 0.00045, "vq_loss_layer_024": 0.000425, "vq_loss_layer_025": 0.000645, "vq_loss_layer_026": 0.000854, "vq_loss_layer_027": 0.000786, "vq_loss_layer_028": 0.001152, "vq_loss_layer_029": 0.001183, "vq_loss_layer_030": 0.003754, "vq_loss_layer_031": 0.004791 }, { "ce_loss": 2.290028, "epoch": 0.00996, "grad_norm": 0.0017669729422777891, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.05159, "kv_vq_loss": 0.000455, "learning_rate": 0.0009995648346059246, "loss": 0.052087, "step": 9960, "value_mse_loss_layer_000": 0.000534, "value_mse_loss_layer_001": 0.001587, "value_mse_loss_layer_002": 0.005463, "value_mse_loss_layer_003": 0.009094, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.007996, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010681, "value_mse_loss_layer_008": 0.013062, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.013916, "value_mse_loss_layer_011": 0.014954, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.02417, "value_mse_loss_layer_021": 0.025635, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.046631, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.058838, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000176, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.000192, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000465, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.310875, "epoch": 0.00997, "grad_norm": 0.0019348151981830597, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.090332, "key_mse_loss_layer_009": 0.097656, "key_mse_loss_layer_010": 0.11084, "key_mse_loss_layer_011": 0.105957, "key_mse_loss_layer_012": 0.07959, "key_mse_loss_layer_013": 0.137695, "key_mse_loss_layer_014": 0.133789, "key_mse_loss_layer_015": 0.123047, "key_mse_loss_layer_016": 0.116211, "key_mse_loss_layer_017": 0.117188, "key_mse_loss_layer_018": 0.124023, "key_mse_loss_layer_019": 0.100586, "key_mse_loss_layer_020": 0.115234, "key_mse_loss_layer_021": 0.111816, "key_mse_loss_layer_022": 0.115723, "key_mse_loss_layer_023": 0.117188, "key_mse_loss_layer_024": 0.092773, "key_mse_loss_layer_025": 0.087402, "key_mse_loss_layer_026": 0.102051, "key_mse_loss_layer_027": 0.102051, "key_mse_loss_layer_028": 0.109863, "key_mse_loss_layer_029": 0.098145, "key_mse_loss_layer_030": 0.10498, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.051239, "kv_vq_loss": 0.000431, "learning_rate": 0.0009996737895779137, "loss": 0.051727, "step": 9970, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001541, "value_mse_loss_layer_002": 0.007751, "value_mse_loss_layer_003": 0.009399, "value_mse_loss_layer_004": 0.009521, "value_mse_loss_layer_005": 0.008911, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.01123, "value_mse_loss_layer_008": 0.013, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.014465, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.062012, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.063965, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 2.2e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.00018, "vq_loss_layer_008": 0.000203, "vq_loss_layer_009": 0.000273, "vq_loss_layer_010": 0.000265, "vq_loss_layer_011": 0.000229, "vq_loss_layer_012": 0.000364, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000433, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.000231, "vq_loss_layer_021": 0.000343, "vq_loss_layer_022": 0.000292, "vq_loss_layer_023": 0.000376, "vq_loss_layer_024": 0.000341, "vq_loss_layer_025": 0.000425, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.001266, "vq_loss_layer_029": 0.00119, "vq_loss_layer_030": 0.002289, "vq_loss_layer_031": 0.005005 }, { "ce_loss": 2.307248, "epoch": 0.00998, "grad_norm": 0.0014425497502088547, "key_mse_loss_layer_000": 0.00386, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.05116, "kv_vq_loss": 0.000451, "learning_rate": 0.0009997826353218427, "loss": 0.051654, "step": 9980, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001579, "value_mse_loss_layer_002": 0.005554, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010559, "value_mse_loss_layer_008": 0.013428, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014587, "value_mse_loss_layer_011": 0.015564, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.021484, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.05835, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.056396, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000189, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000376, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000431, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000357, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000343, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000452, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000797, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.003036 }, { "ce_loss": 2.265282, "epoch": 0.00999, "grad_norm": 0.001931705279275775, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.051315, "kv_vq_loss": 0.000435, "learning_rate": 0.0009998913720564956, "loss": 0.051801, "step": 9990, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001556, "value_mse_loss_layer_002": 0.005676, "value_mse_loss_layer_003": 0.009277, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.013, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.014282, "value_mse_loss_layer_011": 0.015076, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.025269, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.057861, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.058105, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.004272 }, { "ce_loss": 2.284838, "epoch": 0.01, "grad_norm": 0.0021320923697203398, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.103516, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.111328, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.097168, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.051193, "kv_vq_loss": 0.000442, "learning_rate": 0.0009999999999999998, "loss": 0.051678, "step": 10000, "value_mse_loss_layer_000": 0.000538, "value_mse_loss_layer_001": 0.001572, "value_mse_loss_layer_002": 0.005737, "value_mse_loss_layer_003": 0.009399, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.015442, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.028809, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000233, "vq_loss_layer_010": 0.000207, "vq_loss_layer_011": 0.000226, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000412, "vq_loss_layer_015": 0.000456, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.0005, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000303, "vq_loss_layer_024": 0.000301, "vq_loss_layer_025": 0.000383, "vq_loss_layer_026": 0.000534, "vq_loss_layer_027": 0.000683, "vq_loss_layer_028": 0.000965, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.002548, "vq_loss_layer_031": 0.004272 }, { "ce_loss": 2.289513, "epoch": 0.01001, "grad_norm": 0.0018899835413321853, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.124023, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.104492, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.109863, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.101562, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.094238, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.051367, "kv_vq_loss": 0.00046, "learning_rate": 0.001, "loss": 0.051862, "step": 10010, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001587, "value_mse_loss_layer_002": 0.005493, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.00824, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.013062, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.014221, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.058105, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.057861, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000233, "vq_loss_layer_010": 0.000201, "vq_loss_layer_011": 0.00023, "vq_loss_layer_012": 0.000385, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.00036, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000231, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.000458, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000813, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.003357 }, { "ce_loss": 2.293048, "epoch": 0.01002, "grad_norm": 0.0018212206196039915, "key_mse_loss_layer_000": 0.003571, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.053955, "key_mse_loss_layer_004": 0.060791, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.051007, "kv_vq_loss": 0.000436, "learning_rate": 0.001, "loss": 0.051495, "step": 10020, "value_mse_loss_layer_000": 0.000549, "value_mse_loss_layer_001": 0.001579, "value_mse_loss_layer_002": 0.005432, "value_mse_loss_layer_003": 0.009705, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007629, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.013916, "value_mse_loss_layer_011": 0.014648, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.016846, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.018066, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.022705, "value_mse_loss_layer_021": 0.025635, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.028442, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.061035, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.057129, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.00012, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000429, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.00022, "vq_loss_layer_023": 0.00022, "vq_loss_layer_024": 0.000224, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000435, "vq_loss_layer_028": 0.000645, "vq_loss_layer_029": 0.000904, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.318088, "epoch": 0.01003, "grad_norm": 0.001589016756042838, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.051389, "kv_vq_loss": 0.000456, "learning_rate": 0.001, "loss": 0.051877, "step": 10030, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001556, "value_mse_loss_layer_002": 0.005554, "value_mse_loss_layer_003": 0.009644, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.008667, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010803, "value_mse_loss_layer_008": 0.013428, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.01416, "value_mse_loss_layer_011": 0.014587, "value_mse_loss_layer_012": 0.015259, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.021362, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.029541, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.05127, "value_mse_loss_layer_029": 0.061523, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.061035, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000205, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.00119, "vq_loss_layer_029": 0.001213, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.004517 }, { "ce_loss": 2.332535, "epoch": 0.01004, "grad_norm": 0.001751953735947609, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.051196, "kv_vq_loss": 0.000446, "learning_rate": 0.001, "loss": 0.051685, "step": 10040, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001579, "value_mse_loss_layer_002": 0.005585, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.008423, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.013367, "value_mse_loss_layer_009": 0.017944, "value_mse_loss_layer_010": 0.014954, "value_mse_loss_layer_011": 0.015747, "value_mse_loss_layer_012": 0.017578, "value_mse_loss_layer_013": 0.018677, "value_mse_loss_layer_014": 0.018921, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.018799, "value_mse_loss_layer_019": 0.021484, "value_mse_loss_layer_020": 0.023193, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.039551, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.058105, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000201, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000429, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000349, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.000364, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.003159 }, { "ce_loss": 2.293916, "epoch": 0.01005, "grad_norm": 0.0018140451284125447, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.051022, "kv_vq_loss": 0.000436, "learning_rate": 0.001, "loss": 0.051517, "step": 10050, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001556, "value_mse_loss_layer_002": 0.005402, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007782, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.012817, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.014954, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.018188, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.026001, "value_mse_loss_layer_022": 0.027344, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.062012, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.056885, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000328, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.00071, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.001678, "vq_loss_layer_031": 0.003006 }, { "ce_loss": 2.284763, "epoch": 0.01006, "grad_norm": 0.001993292709812522, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.05957, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.05144, "kv_vq_loss": 0.000457, "learning_rate": 0.001, "loss": 0.051932, "step": 10060, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001549, "value_mse_loss_layer_002": 0.005951, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.009827, "value_mse_loss_layer_007": 0.010315, "value_mse_loss_layer_008": 0.013062, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.014282, "value_mse_loss_layer_011": 0.015076, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.018677, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.023682, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.058838, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.056641, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 8.1e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000171, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000418, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000236, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.00042, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000942, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.003159 }, { "ce_loss": 2.323818, "epoch": 0.01007, "grad_norm": 0.0016864259960129857, "key_mse_loss_layer_000": 0.003845, "key_mse_loss_layer_001": 0.011108, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.051279, "kv_vq_loss": 0.000431, "learning_rate": 0.001, "loss": 0.051767, "step": 10070, "value_mse_loss_layer_000": 0.000561, "value_mse_loss_layer_001": 0.00161, "value_mse_loss_layer_002": 0.005768, "value_mse_loss_layer_003": 0.009644, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010742, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.018066, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.027588, "value_mse_loss_layer_022": 0.026611, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.037598, "value_mse_loss_layer_027": 0.046631, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.060059, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000215, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000444, "vq_loss_layer_016": 0.000368, "vq_loss_layer_017": 0.000357, "vq_loss_layer_018": 0.00021, "vq_loss_layer_019": 0.000194, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000446, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.00029, "vq_loss_layer_024": 0.000322, "vq_loss_layer_025": 0.000393, "vq_loss_layer_026": 0.000641, "vq_loss_layer_027": 0.000648, "vq_loss_layer_028": 0.000973, "vq_loss_layer_029": 0.001244, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.004303 }, { "ce_loss": 2.321623, "epoch": 0.01008, "grad_norm": 0.0017949416069313884, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.051001, "kv_vq_loss": 0.000431, "learning_rate": 0.001, "loss": 0.051486, "step": 10080, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001579, "value_mse_loss_layer_002": 0.005371, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.007721, "value_mse_loss_layer_006": 0.009827, "value_mse_loss_layer_007": 0.010437, "value_mse_loss_layer_008": 0.012756, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.015442, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.025635, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.031128, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.060059, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.058838, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000224, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000328, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000278, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000226, "vq_loss_layer_025": 0.000235, "vq_loss_layer_026": 0.000364, "vq_loss_layer_027": 0.000433, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.00079, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.003067 }, { "ce_loss": 2.328604, "epoch": 0.01009, "grad_norm": 0.0015747769502922893, "key_mse_loss_layer_000": 0.00383, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.051035, "kv_vq_loss": 0.000434, "learning_rate": 0.001, "loss": 0.051523, "step": 10090, "value_mse_loss_layer_000": 0.000553, "value_mse_loss_layer_001": 0.001595, "value_mse_loss_layer_002": 0.005432, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.012878, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.015259, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.025513, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.060791, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.058105, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.002625, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.305358, "epoch": 0.0101, "grad_norm": 0.0015919128200039268, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.051266, "kv_vq_loss": 0.000449, "learning_rate": 0.001, "loss": 0.051758, "step": 10100, "value_mse_loss_layer_000": 0.000496, "value_mse_loss_layer_001": 0.001534, "value_mse_loss_layer_002": 0.00531, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008179, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.010742, "value_mse_loss_layer_008": 0.013184, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014771, "value_mse_loss_layer_011": 0.015503, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.021484, "value_mse_loss_layer_020": 0.02356, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.02771, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.061768, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.059082, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000243, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000225, "vq_loss_layer_012": 0.000418, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000359, "vq_loss_layer_018": 0.000224, "vq_loss_layer_019": 0.000201, "vq_loss_layer_020": 0.000228, "vq_loss_layer_021": 0.000256, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.003052 }, { "ce_loss": 2.264328, "epoch": 0.01011, "grad_norm": 0.0018251966685056686, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.051337, "kv_vq_loss": 0.000444, "learning_rate": 0.001, "loss": 0.051825, "step": 10110, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001549, "value_mse_loss_layer_002": 0.005615, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.00824, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.010803, "value_mse_loss_layer_008": 0.013184, "value_mse_loss_layer_009": 0.017456, "value_mse_loss_layer_010": 0.014221, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.018433, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.029785, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000414, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000294, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.003433 }, { "ce_loss": 2.314346, "epoch": 0.01012, "grad_norm": 0.0013850502436980605, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.051224, "kv_vq_loss": 0.000431, "learning_rate": 0.001, "loss": 0.051712, "step": 10120, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001549, "value_mse_loss_layer_002": 0.005615, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.008606, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.01062, "value_mse_loss_layer_008": 0.012878, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.014282, "value_mse_loss_layer_011": 0.014771, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.028442, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.057861, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.057373, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000185, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.000205, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000399, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000239, "vq_loss_layer_024": 0.000277, "vq_loss_layer_025": 0.00036, "vq_loss_layer_026": 0.000444, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000881, "vq_loss_layer_029": 0.000977, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.003845 }, { "ce_loss": 2.269732, "epoch": 0.01013, "grad_norm": 0.0022567790001630783, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.051495, "kv_vq_loss": 0.000465, "learning_rate": 0.001, "loss": 0.051993, "step": 10130, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001518, "value_mse_loss_layer_002": 0.005615, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.00824, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.010986, "value_mse_loss_layer_008": 0.013062, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.014893, "value_mse_loss_layer_012": 0.015503, "value_mse_loss_layer_013": 0.0177, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.020996, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.026245, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.060791, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000179, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000402, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000213, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000475, "vq_loss_layer_028": 0.00082, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.002762, "vq_loss_layer_031": 0.003433 }, { "ce_loss": 2.330426, "epoch": 0.01014, "grad_norm": 0.001453496515750885, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.054199, "key_mse_loss_layer_004": 0.060059, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.081055, "kv_mse_loss": 0.051181, "kv_vq_loss": 0.000436, "learning_rate": 0.001, "loss": 0.051663, "step": 10140, "value_mse_loss_layer_000": 0.000546, "value_mse_loss_layer_001": 0.001579, "value_mse_loss_layer_002": 0.005585, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012756, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.013916, "value_mse_loss_layer_011": 0.015137, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.021362, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.026001, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.058594, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.055908, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000376, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.002274, "vq_loss_layer_030": 0.002716, "vq_loss_layer_031": 0.004089 }, { "ce_loss": 2.26858, "epoch": 0.01015, "grad_norm": 0.001773684867657721, "key_mse_loss_layer_000": 0.002884, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.073242, "key_mse_loss_layer_027": 0.072266, "key_mse_loss_layer_028": 0.07959, "key_mse_loss_layer_029": 0.075684, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.051382, "kv_vq_loss": 0.000434, "learning_rate": 0.001, "loss": 0.051868, "step": 10150, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001526, "value_mse_loss_layer_002": 0.00531, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010742, "value_mse_loss_layer_008": 0.012939, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.01416, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017944, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.020996, "value_mse_loss_layer_016": 0.018066, "value_mse_loss_layer_017": 0.021362, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.026123, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.060303, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.05542, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000381, "vq_loss_layer_027": 0.000423, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.001663, "vq_loss_layer_031": 0.003174 }, { "ce_loss": 2.343001, "epoch": 0.01016, "grad_norm": 0.0016455028671771288, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050815, "kv_vq_loss": 0.000439, "learning_rate": 0.001, "loss": 0.0513, "step": 10160, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001526, "value_mse_loss_layer_002": 0.00528, "value_mse_loss_layer_003": 0.00885, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.010559, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.055664, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000136, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000225, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000452, "vq_loss_layer_028": 0.000668, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.324025, "epoch": 0.01017, "grad_norm": 0.0014932082267478108, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050977, "kv_vq_loss": 0.000428, "learning_rate": 0.001, "loss": 0.051453, "step": 10170, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001534, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009827, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.013306, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.014343, "value_mse_loss_layer_011": 0.01532, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.0177, "value_mse_loss_layer_014": 0.019409, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.026611, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.05542, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.000224, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.00045, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000797, "vq_loss_layer_030": 0.001945, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.315672, "epoch": 0.01018, "grad_norm": 0.001914230640977621, "key_mse_loss_layer_000": 0.00267, "key_mse_loss_layer_001": 0.009705, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.041016, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.061279, "key_mse_loss_layer_007": 0.070801, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.080078, "key_mse_loss_layer_020": 0.088379, "key_mse_loss_layer_021": 0.083984, "key_mse_loss_layer_022": 0.084961, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.07959, "key_mse_loss_layer_029": 0.07373, "key_mse_loss_layer_030": 0.069336, "key_mse_loss_layer_031": 0.054932, "kv_mse_loss": 0.050894, "kv_vq_loss": 0.000449, "learning_rate": 0.001, "loss": 0.051392, "step": 10180, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001503, "value_mse_loss_layer_002": 0.005341, "value_mse_loss_layer_003": 0.009094, "value_mse_loss_layer_004": 0.008423, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.013, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014282, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.018799, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.016602, "value_mse_loss_layer_017": 0.021118, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.027344, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.061035, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.055664, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000222, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.00038, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.000439, "vq_loss_layer_015": 0.000479, "vq_loss_layer_016": 0.000383, "vq_loss_layer_017": 0.000362, "vq_loss_layer_018": 0.000223, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000256, "vq_loss_layer_021": 0.000463, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000372, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.000359, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001228, "vq_loss_layer_030": 0.002762, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.322642, "epoch": 0.01019, "grad_norm": 0.001659331494010985, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050891, "kv_vq_loss": 0.000437, "learning_rate": 0.001, "loss": 0.051379, "step": 10190, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001556, "value_mse_loss_layer_002": 0.005707, "value_mse_loss_layer_003": 0.009399, "value_mse_loss_layer_004": 0.008911, "value_mse_loss_layer_005": 0.008484, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.010803, "value_mse_loss_layer_008": 0.013123, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.01416, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.0177, "value_mse_loss_layer_014": 0.018555, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.018066, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.05957, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000198, "vq_loss_layer_011": 0.000231, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000433, "vq_loss_layer_016": 0.00041, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000211, "vq_loss_layer_019": 0.000197, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000378, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000393, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000687, "vq_loss_layer_028": 0.000832, "vq_loss_layer_029": 0.001038, "vq_loss_layer_030": 0.002441, "vq_loss_layer_031": 0.003815 }, { "ce_loss": 2.286993, "epoch": 0.0102, "grad_norm": 0.0019614105112850666, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.05126, "kv_vq_loss": 0.000434, "learning_rate": 0.001, "loss": 0.051743, "step": 10200, "value_mse_loss_layer_000": 0.000534, "value_mse_loss_layer_001": 0.001572, "value_mse_loss_layer_002": 0.005341, "value_mse_loss_layer_003": 0.00885, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007721, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.01062, "value_mse_loss_layer_008": 0.013, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.015137, "value_mse_loss_layer_012": 0.015503, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.056152, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000368, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000185, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.000355, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000969, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.284238, "epoch": 0.01021, "grad_norm": 0.0016635468928143382, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.051199, "kv_vq_loss": 0.000434, "learning_rate": 0.001, "loss": 0.051688, "step": 10210, "value_mse_loss_layer_000": 0.000534, "value_mse_loss_layer_001": 0.001549, "value_mse_loss_layer_002": 0.00531, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010437, "value_mse_loss_layer_008": 0.013062, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014648, "value_mse_loss_layer_011": 0.015503, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.021484, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.021362, "value_mse_loss_layer_020": 0.02356, "value_mse_loss_layer_021": 0.026123, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.030396, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.060303, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.056641, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000233, "vq_loss_layer_012": 0.000385, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.00046, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000181, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000317, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.001938, "vq_loss_layer_031": 0.003174 }, { "ce_loss": 2.290778, "epoch": 0.01022, "grad_norm": 0.0018723688554018736, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.060059, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.124023, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.109863, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.103516, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.102051, "key_mse_loss_layer_024": 0.083496, "key_mse_loss_layer_025": 0.078613, "key_mse_loss_layer_026": 0.091309, "key_mse_loss_layer_027": 0.091309, "key_mse_loss_layer_028": 0.097168, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.051126, "kv_vq_loss": 0.000458, "learning_rate": 0.001, "loss": 0.051627, "step": 10220, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.001526, "value_mse_loss_layer_002": 0.005646, "value_mse_loss_layer_003": 0.00946, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.011108, "value_mse_loss_layer_008": 0.013184, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.014221, "value_mse_loss_layer_011": 0.014954, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.042969, "value_mse_loss_layer_026": 0.038818, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.064453, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000195, "vq_loss_layer_008": 0.000212, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000218, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000364, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000414, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.000376, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.00041, "vq_loss_layer_026": 0.000572, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.001167, "vq_loss_layer_029": 0.001167, "vq_loss_layer_030": 0.002701, "vq_loss_layer_031": 0.005127 }, { "ce_loss": 2.345553, "epoch": 0.01023, "grad_norm": 0.0020049619488418102, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.077637, "kv_mse_loss": 0.050763, "kv_vq_loss": 0.000435, "learning_rate": 0.001, "loss": 0.051251, "step": 10230, "value_mse_loss_layer_000": 0.000534, "value_mse_loss_layer_001": 0.001556, "value_mse_loss_layer_002": 0.00528, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007599, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.013184, "value_mse_loss_layer_009": 0.018066, "value_mse_loss_layer_010": 0.014648, "value_mse_loss_layer_011": 0.015259, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.02124, "value_mse_loss_layer_016": 0.0177, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.021484, "value_mse_loss_layer_020": 0.022949, "value_mse_loss_layer_021": 0.026245, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.029419, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.05542, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000259, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000362, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000248, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.003067 }, { "ce_loss": 2.319567, "epoch": 0.01024, "grad_norm": 0.0017895267810672522, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.051157, "kv_vq_loss": 0.00043, "learning_rate": 0.001, "loss": 0.051639, "step": 10240, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001526, "value_mse_loss_layer_002": 0.005402, "value_mse_loss_layer_003": 0.010071, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.008545, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.013184, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.016357, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.026611, "value_mse_loss_layer_022": 0.026611, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.058838, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000393, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000372, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000232, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000486, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.003403 }, { "ce_loss": 2.324372, "epoch": 0.01025, "grad_norm": 0.0015051404479891062, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050858, "kv_vq_loss": 0.000437, "learning_rate": 0.001, "loss": 0.051337, "step": 10250, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001526, "value_mse_loss_layer_002": 0.00531, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.010315, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.012634, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.013794, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.018188, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.05835, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.055908, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000135, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.00023, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.001137, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.297793, "epoch": 0.01026, "grad_norm": 0.0018093893304467201, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.051285, "kv_vq_loss": 0.000474, "learning_rate": 0.001, "loss": 0.051773, "step": 10260, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001541, "value_mse_loss_layer_002": 0.005371, "value_mse_loss_layer_003": 0.009094, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010864, "value_mse_loss_layer_008": 0.013184, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.01416, "value_mse_loss_layer_011": 0.015198, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.056885, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000171, "vq_loss_layer_008": 0.000223, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000239, "vq_loss_layer_011": 0.000235, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000406, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000374, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.00022, "vq_loss_layer_021": 0.000378, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000389, "vq_loss_layer_025": 0.00045, "vq_loss_layer_026": 0.000656, "vq_loss_layer_027": 0.000774, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.004272 }, { "ce_loss": 2.318554, "epoch": 0.01027, "grad_norm": 0.0015390762127935886, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05249, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.050876, "kv_vq_loss": 0.000428, "learning_rate": 0.001, "loss": 0.051355, "step": 10270, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001526, "value_mse_loss_layer_002": 0.005493, "value_mse_loss_layer_003": 0.010376, "value_mse_loss_layer_004": 0.008423, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010681, "value_mse_loss_layer_008": 0.012817, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.014343, "value_mse_loss_layer_011": 0.014954, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.016602, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.05835, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.055664, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.00012, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000182, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000205, "vq_loss_layer_011": 0.000224, "vq_loss_layer_012": 0.000364, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000448, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.00037, "vq_loss_layer_018": 0.000311, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000214, "vq_loss_layer_023": 0.000301, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000423, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.001442, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.259865, "epoch": 0.01028, "grad_norm": 0.0019584153778851032, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.051398, "kv_vq_loss": 0.000437, "learning_rate": 0.001, "loss": 0.05188, "step": 10280, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001534, "value_mse_loss_layer_002": 0.005371, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010559, "value_mse_loss_layer_008": 0.012817, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.021118, "value_mse_loss_layer_018": 0.018066, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.022705, "value_mse_loss_layer_021": 0.026367, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.059814, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.057861, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000209, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000408, "vq_loss_layer_027": 0.000458, "vq_loss_layer_028": 0.000725, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.0019, "vq_loss_layer_031": 0.003571 }, { "ce_loss": 2.298051, "epoch": 0.01029, "grad_norm": 0.0016282881842926145, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.051041, "kv_vq_loss": 0.000431, "learning_rate": 0.001, "loss": 0.051517, "step": 10290, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001534, "value_mse_loss_layer_002": 0.005341, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010864, "value_mse_loss_layer_008": 0.013245, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014832, "value_mse_loss_layer_011": 0.015381, "value_mse_loss_layer_012": 0.016602, "value_mse_loss_layer_013": 0.0177, "value_mse_loss_layer_014": 0.018677, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.016846, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.023315, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.028564, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.056396, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000376, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.00022, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000226, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000416, "vq_loss_layer_028": 0.001091, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.001648, "vq_loss_layer_031": 0.003067 }, { "ce_loss": 2.317781, "epoch": 0.0103, "grad_norm": 0.0014608962228521705, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.059082, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.0625, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.080078, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.051126, "kv_vq_loss": 0.00044, "learning_rate": 0.001, "loss": 0.051617, "step": 10300, "value_mse_loss_layer_000": 0.000526, "value_mse_loss_layer_001": 0.001556, "value_mse_loss_layer_002": 0.005341, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.012878, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.014404, "value_mse_loss_layer_011": 0.014893, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.0177, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.02124, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.021118, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.021362, "value_mse_loss_layer_020": 0.023071, "value_mse_loss_layer_021": 0.029053, "value_mse_loss_layer_022": 0.027466, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000233, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000347, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000393, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.000376, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.00074, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.324715, "epoch": 0.01031, "grad_norm": 0.0013329086359590292, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050824, "kv_vq_loss": 0.000438, "learning_rate": 0.001, "loss": 0.051312, "step": 10310, "value_mse_loss_layer_000": 0.000538, "value_mse_loss_layer_001": 0.001579, "value_mse_loss_layer_002": 0.005432, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010864, "value_mse_loss_layer_008": 0.013184, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.015442, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.026611, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.056641, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 8.3e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003082 }, { "ce_loss": 2.328157, "epoch": 0.01032, "grad_norm": 0.0021541262976825237, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.051578, "kv_vq_loss": 0.000436, "learning_rate": 0.001, "loss": 0.05206, "step": 10320, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001518, "value_mse_loss_layer_002": 0.005524, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.012817, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.024536, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.00037, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.00037, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000652, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.003815 }, { "ce_loss": 2.287548, "epoch": 0.01033, "grad_norm": 0.0015940528828650713, "key_mse_loss_layer_000": 0.005157, "key_mse_loss_layer_001": 0.013245, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.094238, "key_mse_loss_layer_030": 0.097656, "key_mse_loss_layer_031": 0.083008, "kv_mse_loss": 0.051233, "kv_vq_loss": 0.000437, "learning_rate": 0.001, "loss": 0.051721, "step": 10330, "value_mse_loss_layer_000": 0.000572, "value_mse_loss_layer_001": 0.001602, "value_mse_loss_layer_002": 0.005585, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.009827, "value_mse_loss_layer_007": 0.010437, "value_mse_loss_layer_008": 0.012939, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.014404, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.022705, "value_mse_loss_layer_020": 0.023193, "value_mse_loss_layer_021": 0.030151, "value_mse_loss_layer_022": 0.027344, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.037354, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.036377, "value_mse_loss_layer_027": 0.047363, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.067383, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.0625, "vq_loss_layer_000": 8e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000216, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000385, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000181, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000381, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000492, "vq_loss_layer_025": 0.000402, "vq_loss_layer_026": 0.000561, "vq_loss_layer_027": 0.000759, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.002075, "vq_loss_layer_030": 0.003082, "vq_loss_layer_031": 0.004822 }, { "ce_loss": 2.277962, "epoch": 0.01034, "grad_norm": 0.0017222221940755844, "key_mse_loss_layer_000": 0.002777, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.043457, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.060303, "kv_mse_loss": 0.051382, "kv_vq_loss": 0.000443, "learning_rate": 0.001, "loss": 0.051874, "step": 10340, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001511, "value_mse_loss_layer_002": 0.005554, "value_mse_loss_layer_003": 0.00885, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.008179, "value_mse_loss_layer_006": 0.010437, "value_mse_loss_layer_007": 0.010803, "value_mse_loss_layer_008": 0.013245, "value_mse_loss_layer_009": 0.017944, "value_mse_loss_layer_010": 0.014465, "value_mse_loss_layer_011": 0.01532, "value_mse_loss_layer_012": 0.016968, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.019165, "value_mse_loss_layer_015": 0.021118, "value_mse_loss_layer_016": 0.016846, "value_mse_loss_layer_017": 0.020874, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.025269, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.028442, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.054932, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000238, "vq_loss_layer_010": 0.000206, "vq_loss_layer_011": 0.000225, "vq_loss_layer_012": 0.000414, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000427, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000221, "vq_loss_layer_021": 0.000391, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000298, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.000359, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000458, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.000969, "vq_loss_layer_030": 0.002655, "vq_loss_layer_031": 0.00354 }, { "ce_loss": 2.30609, "epoch": 0.01035, "grad_norm": 0.0013649265747517347, "key_mse_loss_layer_000": 0.002792, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.051364, "kv_vq_loss": 0.000434, "learning_rate": 0.001, "loss": 0.051849, "step": 10350, "value_mse_loss_layer_000": 0.000484, "value_mse_loss_layer_001": 0.001495, "value_mse_loss_layer_002": 0.005341, "value_mse_loss_layer_003": 0.009583, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.010803, "value_mse_loss_layer_008": 0.012817, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014221, "value_mse_loss_layer_011": 0.015503, "value_mse_loss_layer_012": 0.016235, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.022949, "value_mse_loss_layer_021": 0.025513, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.059814, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.057373, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000176, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000425, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000236, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000209, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000422, "vq_loss_layer_028": 0.000824, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.001686, "vq_loss_layer_031": 0.003265 }, { "ce_loss": 2.320735, "epoch": 0.01036, "grad_norm": 0.0025133066810667515, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.051221, "kv_vq_loss": 0.000428, "learning_rate": 0.001, "loss": 0.051709, "step": 10360, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001534, "value_mse_loss_layer_002": 0.005524, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.012878, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.01416, "value_mse_loss_layer_011": 0.014648, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000357, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000219, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000713, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.003708 }, { "ce_loss": 2.320058, "epoch": 0.01037, "grad_norm": 0.0016714731464162469, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.050885, "kv_vq_loss": 0.000445, "learning_rate": 0.001, "loss": 0.051376, "step": 10370, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001511, "value_mse_loss_layer_002": 0.005157, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007751, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.010315, "value_mse_loss_layer_008": 0.012756, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.014771, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.018188, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.026733, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.00021, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000751, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.277614, "epoch": 0.01038, "grad_norm": 0.0015320040984079242, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.051086, "kv_vq_loss": 0.000433, "learning_rate": 0.001, "loss": 0.051569, "step": 10380, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001534, "value_mse_loss_layer_002": 0.005341, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007538, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015564, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.056885, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.055908, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000332, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000166, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000204, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000275, "vq_loss_layer_026": 0.000376, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000645, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.001541, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.295682, "epoch": 0.01039, "grad_norm": 0.0016962006920948625, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.051132, "kv_vq_loss": 0.000437, "learning_rate": 0.001, "loss": 0.051624, "step": 10390, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001511, "value_mse_loss_layer_002": 0.005371, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.013, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.014771, "value_mse_loss_layer_012": 0.015564, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.019775, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000214, "vq_loss_layer_023": 0.000239, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.285546, "epoch": 0.0104, "grad_norm": 0.0015087173087522388, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.059814, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.051105, "kv_vq_loss": 0.000436, "learning_rate": 0.001, "loss": 0.051593, "step": 10400, "value_mse_loss_layer_000": 0.000538, "value_mse_loss_layer_001": 0.001556, "value_mse_loss_layer_002": 0.005768, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007782, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.01062, "value_mse_loss_layer_008": 0.012817, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.014771, "value_mse_loss_layer_012": 0.015503, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.030396, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.055908, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000469, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000801, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.292037, "epoch": 0.01041, "grad_norm": 0.0018128763185814023, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.106445, "key_mse_loss_layer_011": 0.104004, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.124023, "key_mse_loss_layer_014": 0.121582, "key_mse_loss_layer_015": 0.109863, "key_mse_loss_layer_016": 0.103516, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.112305, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.10498, "key_mse_loss_layer_021": 0.098633, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.051117, "kv_vq_loss": 0.000434, "learning_rate": 0.001, "loss": 0.051605, "step": 10410, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001549, "value_mse_loss_layer_002": 0.005676, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.017456, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.056885, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000269, "vq_loss_layer_010": 0.000198, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000385, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000191, "vq_loss_layer_020": 0.00022, "vq_loss_layer_021": 0.00037, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000458, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000839, "vq_loss_layer_029": 0.000862, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.00351 }, { "ce_loss": 2.312479, "epoch": 0.01042, "grad_norm": 0.0018940709996968508, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050876, "kv_vq_loss": 0.000433, "learning_rate": 0.001, "loss": 0.051364, "step": 10420, "value_mse_loss_layer_000": 0.000534, "value_mse_loss_layer_001": 0.001534, "value_mse_loss_layer_002": 0.005432, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010681, "value_mse_loss_layer_008": 0.013123, "value_mse_loss_layer_009": 0.017822, "value_mse_loss_layer_010": 0.014709, "value_mse_loss_layer_011": 0.015503, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.018188, "value_mse_loss_layer_014": 0.018555, "value_mse_loss_layer_015": 0.021362, "value_mse_loss_layer_016": 0.016602, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.058838, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000212, "vq_loss_layer_011": 0.000231, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000477, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.273962, "epoch": 0.01043, "grad_norm": 0.0014122080756351352, "key_mse_loss_layer_000": 0.003662, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.051227, "kv_vq_loss": 0.000447, "learning_rate": 0.001, "loss": 0.051706, "step": 10430, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001556, "value_mse_loss_layer_002": 0.005371, "value_mse_loss_layer_003": 0.009949, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013611, "value_mse_loss_layer_011": 0.014771, "value_mse_loss_layer_012": 0.016724, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.018433, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.028564, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.058594, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.058105, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000172, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.00023, "vq_loss_layer_012": 0.000423, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000173, "vq_loss_layer_021": 0.000261, "vq_loss_layer_022": 0.000201, "vq_loss_layer_023": 0.000224, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.294216, "epoch": 0.01044, "grad_norm": 0.0021997771691530943, "key_mse_loss_layer_000": 0.003708, "key_mse_loss_layer_001": 0.011169, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.091309, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.051047, "kv_vq_loss": 0.000434, "learning_rate": 0.001, "loss": 0.051532, "step": 10440, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001556, "value_mse_loss_layer_002": 0.005432, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.008728, "value_mse_loss_layer_005": 0.00824, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.013367, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.014587, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.022949, "value_mse_loss_layer_021": 0.026611, "value_mse_loss_layer_022": 0.027222, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.049072, "value_mse_loss_layer_026": 0.037842, "value_mse_loss_layer_027": 0.04834, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.071777, "value_mse_loss_layer_030": 0.070801, "value_mse_loss_layer_031": 0.061768, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000243, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000242, "vq_loss_layer_019": 0.0002, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000391, "vq_loss_layer_025": 0.000408, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.001259, "vq_loss_layer_030": 0.002686, "vq_loss_layer_031": 0.003708 }, { "ce_loss": 2.254972, "epoch": 0.01045, "grad_norm": 0.0018357641529291868, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050867, "kv_vq_loss": 0.000455, "learning_rate": 0.001, "loss": 0.051355, "step": 10450, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.001503, "value_mse_loss_layer_002": 0.005493, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007599, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010315, "value_mse_loss_layer_008": 0.012512, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.013794, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.02478, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.055908, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.00012, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.00024, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000362, "vq_loss_layer_018": 0.000212, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000197, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000402, "vq_loss_layer_028": 0.00095, "vq_loss_layer_029": 0.000721, "vq_loss_layer_030": 0.00174, "vq_loss_layer_031": 0.003098 }, { "ce_loss": 2.315915, "epoch": 0.01046, "grad_norm": 0.001726604881696403, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.051013, "kv_vq_loss": 0.000441, "learning_rate": 0.001, "loss": 0.051498, "step": 10460, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001511, "value_mse_loss_layer_002": 0.005432, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.010803, "value_mse_loss_layer_008": 0.013306, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.01416, "value_mse_loss_layer_011": 0.015259, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.018066, "value_mse_loss_layer_014": 0.018555, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.01709, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.062256, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.055664, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000201, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000383, "vq_loss_layer_017": 0.000347, "vq_loss_layer_018": 0.000215, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000378, "vq_loss_layer_022": 0.000305, "vq_loss_layer_023": 0.000313, "vq_loss_layer_024": 0.000309, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.003433 }, { "ce_loss": 2.295779, "epoch": 0.01047, "grad_norm": 0.0019370479276403785, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.05108, "kv_vq_loss": 0.000447, "learning_rate": 0.001, "loss": 0.051572, "step": 10470, "value_mse_loss_layer_000": 0.000526, "value_mse_loss_layer_001": 0.001564, "value_mse_loss_layer_002": 0.005463, "value_mse_loss_layer_003": 0.009583, "value_mse_loss_layer_004": 0.008972, "value_mse_loss_layer_005": 0.008423, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010803, "value_mse_loss_layer_008": 0.013062, "value_mse_loss_layer_009": 0.017944, "value_mse_loss_layer_010": 0.013977, "value_mse_loss_layer_011": 0.014893, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.020386, "value_mse_loss_layer_018": 0.021118, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.031738, "value_mse_loss_layer_024": 0.037842, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.065918, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.060059, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000194, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000362, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000246, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.000326, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.00119, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.00386 }, { "ce_loss": 2.35151, "epoch": 0.01048, "grad_norm": 0.0012888083001598716, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.079102, "kv_mse_loss": 0.050922, "kv_vq_loss": 0.000425, "learning_rate": 0.001, "loss": 0.051404, "step": 10480, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001556, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007751, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012878, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.014343, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000908, "vq_loss_layer_029": 0.001625, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.352961, "epoch": 0.01049, "grad_norm": 0.0013976253103464842, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.051056, "kv_vq_loss": 0.000422, "learning_rate": 0.001, "loss": 0.051538, "step": 10490, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001511, "value_mse_loss_layer_002": 0.005249, "value_mse_loss_layer_003": 0.00885, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009827, "value_mse_loss_layer_007": 0.010559, "value_mse_loss_layer_008": 0.012817, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.013977, "value_mse_loss_layer_011": 0.015442, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.025635, "value_mse_loss_layer_022": 0.027588, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.058105, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.056885, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000171, "vq_loss_layer_008": 0.000182, "vq_loss_layer_009": 0.000238, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000241, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000402, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.00021, "vq_loss_layer_019": 0.000187, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000359, "vq_loss_layer_022": 0.000332, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.003448 }, { "ce_loss": 2.310272, "epoch": 0.0105, "grad_norm": 0.0016989066498354077, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.058838, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.050732, "kv_vq_loss": 0.00043, "learning_rate": 0.001, "loss": 0.051212, "step": 10500, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001503, "value_mse_loss_layer_002": 0.005249, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007324, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.014343, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.028198, "value_mse_loss_layer_022": 0.026611, "value_mse_loss_layer_023": 0.028564, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.059814, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.053955, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.00022, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.001228, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.294716, "epoch": 0.01051, "grad_norm": 0.0015228795818984509, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.051166, "kv_vq_loss": 0.000439, "learning_rate": 0.001, "loss": 0.051651, "step": 10510, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001526, "value_mse_loss_layer_002": 0.005432, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.008301, "value_mse_loss_layer_006": 0.009827, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.012756, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.013916, "value_mse_loss_layer_011": 0.014648, "value_mse_loss_layer_012": 0.015259, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.028564, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.054932, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000862, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.363479, "epoch": 0.01052, "grad_norm": 0.0014555442612618208, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.051114, "kv_vq_loss": 0.000445, "learning_rate": 0.001, "loss": 0.051605, "step": 10520, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001518, "value_mse_loss_layer_002": 0.005249, "value_mse_loss_layer_003": 0.009094, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008179, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010437, "value_mse_loss_layer_008": 0.012939, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.014771, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.024536, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.028442, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.05542, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.000224, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.00041, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000374, "vq_loss_layer_017": 0.000483, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000473, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.0009, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.003403 }, { "ce_loss": 2.345988, "epoch": 0.01053, "grad_norm": 0.0014773850562050939, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.051068, "kv_vq_loss": 0.000427, "learning_rate": 0.001, "loss": 0.051547, "step": 10530, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001534, "value_mse_loss_layer_002": 0.005249, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.007812, "value_mse_loss_layer_006": 0.010071, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.012634, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.013794, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.054932, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.00021, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000469, "vq_loss_layer_027": 0.000452, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.001671, "vq_loss_layer_031": 0.003174 }, { "ce_loss": 2.299785, "epoch": 0.01054, "grad_norm": 0.0015575913712382317, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.05105, "kv_vq_loss": 0.000431, "learning_rate": 0.001, "loss": 0.051529, "step": 10540, "value_mse_loss_layer_000": 0.000526, "value_mse_loss_layer_001": 0.001518, "value_mse_loss_layer_002": 0.005463, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.012512, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014587, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.05835, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.053955, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000425, "vq_loss_layer_016": 0.000357, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000368, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.0019, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.326622, "epoch": 0.01055, "grad_norm": 0.0017851939192041755, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.053711, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.050653, "kv_vq_loss": 0.000448, "learning_rate": 0.001, "loss": 0.051147, "step": 10550, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001503, "value_mse_loss_layer_002": 0.005157, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012756, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.013794, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.026123, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.030396, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.056885, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.054443, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000236, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000372, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.000206, "vq_loss_layer_025": 0.000237, "vq_loss_layer_026": 0.000395, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000763, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.002838 }, { "ce_loss": 2.325299, "epoch": 0.01056, "grad_norm": 0.0013113656314089894, "key_mse_loss_layer_000": 0.003876, "key_mse_loss_layer_001": 0.011108, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.123535, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.10791, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.110352, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.100586, "key_mse_loss_layer_023": 0.100098, "key_mse_loss_layer_024": 0.081055, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.087891, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.051178, "kv_vq_loss": 0.000438, "learning_rate": 0.001, "loss": 0.051672, "step": 10560, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001534, "value_mse_loss_layer_002": 0.005371, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.008728, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.010193, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.012756, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.018799, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.060303, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.05835, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000137, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000194, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.000223, "vq_loss_layer_011": 0.000228, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000431, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000351, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.001282, "vq_loss_layer_029": 0.001122, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.004089 }, { "ce_loss": 2.277255, "epoch": 0.01057, "grad_norm": 0.002642285078763962, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.0508, "kv_vq_loss": 0.000424, "learning_rate": 0.001, "loss": 0.051282, "step": 10570, "value_mse_loss_layer_000": 0.000526, "value_mse_loss_layer_001": 0.001526, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007996, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.010437, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.056396, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000326, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.001335, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.003418 }, { "ce_loss": 2.294638, "epoch": 0.01058, "grad_norm": 0.0017416944028809667, "key_mse_loss_layer_000": 0.002762, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.092285, "key_mse_loss_layer_009": 0.099121, "key_mse_loss_layer_010": 0.111816, "key_mse_loss_layer_011": 0.107422, "key_mse_loss_layer_012": 0.083008, "key_mse_loss_layer_013": 0.147461, "key_mse_loss_layer_014": 0.143555, "key_mse_loss_layer_015": 0.125977, "key_mse_loss_layer_016": 0.120605, "key_mse_loss_layer_017": 0.120605, "key_mse_loss_layer_018": 0.125977, "key_mse_loss_layer_019": 0.098633, "key_mse_loss_layer_020": 0.115234, "key_mse_loss_layer_021": 0.10791, "key_mse_loss_layer_022": 0.111328, "key_mse_loss_layer_023": 0.108887, "key_mse_loss_layer_024": 0.083984, "key_mse_loss_layer_025": 0.081055, "key_mse_loss_layer_026": 0.094727, "key_mse_loss_layer_027": 0.089844, "key_mse_loss_layer_028": 0.099609, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.051285, "kv_vq_loss": 0.000443, "learning_rate": 0.001, "loss": 0.051773, "step": 10580, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001495, "value_mse_loss_layer_002": 0.005707, "value_mse_loss_layer_003": 0.009277, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009827, "value_mse_loss_layer_007": 0.01062, "value_mse_loss_layer_008": 0.012512, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.014648, "value_mse_loss_layer_011": 0.015381, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.025635, "value_mse_loss_layer_022": 0.022339, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 7.1e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000215, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000269, "vq_loss_layer_011": 0.000254, "vq_loss_layer_012": 0.000404, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000443, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000374, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000222, "vq_loss_layer_019": 0.000204, "vq_loss_layer_020": 0.000309, "vq_loss_layer_021": 0.000465, "vq_loss_layer_022": 0.00036, "vq_loss_layer_023": 0.000349, "vq_loss_layer_024": 0.000368, "vq_loss_layer_025": 0.000576, "vq_loss_layer_026": 0.000587, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.000843, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.325634, "epoch": 0.01059, "grad_norm": 0.001387244206853211, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.091797, "key_mse_loss_layer_009": 0.097168, "key_mse_loss_layer_010": 0.109863, "key_mse_loss_layer_011": 0.105957, "key_mse_loss_layer_012": 0.079102, "key_mse_loss_layer_013": 0.134766, "key_mse_loss_layer_014": 0.130859, "key_mse_loss_layer_015": 0.119629, "key_mse_loss_layer_016": 0.114746, "key_mse_loss_layer_017": 0.11377, "key_mse_loss_layer_018": 0.123047, "key_mse_loss_layer_019": 0.099609, "key_mse_loss_layer_020": 0.11377, "key_mse_loss_layer_021": 0.106934, "key_mse_loss_layer_022": 0.111328, "key_mse_loss_layer_023": 0.111816, "key_mse_loss_layer_024": 0.089355, "key_mse_loss_layer_025": 0.083496, "key_mse_loss_layer_026": 0.100586, "key_mse_loss_layer_027": 0.097168, "key_mse_loss_layer_028": 0.104004, "key_mse_loss_layer_029": 0.092285, "key_mse_loss_layer_030": 0.104004, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.051105, "kv_vq_loss": 0.000436, "learning_rate": 0.001, "loss": 0.051596, "step": 10590, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001488, "value_mse_loss_layer_002": 0.005402, "value_mse_loss_layer_003": 0.00885, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010742, "value_mse_loss_layer_008": 0.013, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.056885, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000261, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000207, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000423, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000345, "vq_loss_layer_022": 0.00024, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000393, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.000908, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.00412 }, { "ce_loss": 2.310555, "epoch": 0.0106, "grad_norm": 0.0014337360626086593, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.05065, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.051126, "step": 10600, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.001495, "value_mse_loss_layer_002": 0.005157, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.016846, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.018677, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.02478, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.054932, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000238, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000213, "vq_loss_layer_025": 0.000257, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000713, "vq_loss_layer_029": 0.000843, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.323435, "epoch": 0.01061, "grad_norm": 0.0013478660257533193, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.061768, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.050854, "kv_vq_loss": 0.000421, "learning_rate": 0.001, "loss": 0.051328, "step": 10610, "value_mse_loss_layer_000": 0.000526, "value_mse_loss_layer_001": 0.001518, "value_mse_loss_layer_002": 0.005127, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013916, "value_mse_loss_layer_011": 0.014648, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.018188, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.031494, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.000228, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000751, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.253908, "epoch": 0.01062, "grad_norm": 0.0018252779264003038, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.051526, "kv_vq_loss": 0.000449, "learning_rate": 0.001, "loss": 0.052014, "step": 10620, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001511, "value_mse_loss_layer_002": 0.005249, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012634, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.014404, "value_mse_loss_layer_012": 0.015503, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.029419, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000204, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000408, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000668, "vq_loss_layer_029": 0.000946, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.299656, "epoch": 0.01063, "grad_norm": 0.001388275995850563, "key_mse_loss_layer_000": 0.004578, "key_mse_loss_layer_001": 0.012573, "key_mse_loss_layer_002": 0.062256, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.06543, "key_mse_loss_layer_006": 0.075684, "key_mse_loss_layer_007": 0.081055, "key_mse_loss_layer_008": 0.09082, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.109863, "key_mse_loss_layer_011": 0.105469, "key_mse_loss_layer_012": 0.080566, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.102539, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.09668, "key_mse_loss_layer_020": 0.107422, "key_mse_loss_layer_021": 0.101562, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.083984, "key_mse_loss_layer_025": 0.080078, "key_mse_loss_layer_026": 0.095215, "key_mse_loss_layer_027": 0.099609, "key_mse_loss_layer_028": 0.100586, "key_mse_loss_layer_029": 0.094238, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.077637, "kv_mse_loss": 0.050757, "kv_vq_loss": 0.000437, "learning_rate": 0.001, "loss": 0.051242, "step": 10630, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001587, "value_mse_loss_layer_002": 0.005676, "value_mse_loss_layer_003": 0.009766, "value_mse_loss_layer_004": 0.009338, "value_mse_loss_layer_005": 0.008606, "value_mse_loss_layer_006": 0.010071, "value_mse_loss_layer_007": 0.010864, "value_mse_loss_layer_008": 0.012878, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014221, "value_mse_loss_layer_011": 0.015076, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.018677, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.017334, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.025879, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.040771, "value_mse_loss_layer_026": 0.038818, "value_mse_loss_layer_027": 0.04834, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.063477, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 7.2e-05, "vq_loss_layer_005": 8.2e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000183, "vq_loss_layer_008": 0.000211, "vq_loss_layer_009": 0.000257, "vq_loss_layer_010": 0.000256, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000366, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.000452, "vq_loss_layer_015": 0.000469, "vq_loss_layer_016": 0.000458, "vq_loss_layer_017": 0.000364, "vq_loss_layer_018": 0.000238, "vq_loss_layer_019": 0.000248, "vq_loss_layer_020": 0.000305, "vq_loss_layer_021": 0.000441, "vq_loss_layer_022": 0.000305, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000515, "vq_loss_layer_025": 0.000465, "vq_loss_layer_026": 0.000813, "vq_loss_layer_027": 0.000969, "vq_loss_layer_028": 0.001091, "vq_loss_layer_029": 0.001625, "vq_loss_layer_030": 0.002762, "vq_loss_layer_031": 0.005005 }, { "ce_loss": 2.28829, "epoch": 0.01064, "grad_norm": 0.0015453413361683488, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.0508, "kv_vq_loss": 0.000416, "learning_rate": 0.001, "loss": 0.05127, "step": 10640, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001511, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007629, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.014893, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.027466, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.054443, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000223, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000243, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000607, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000813, "vq_loss_layer_030": 0.001831, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.297577, "epoch": 0.01065, "grad_norm": 0.0015276853227987885, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.051135, "kv_vq_loss": 0.000443, "learning_rate": 0.001, "loss": 0.05163, "step": 10650, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001511, "value_mse_loss_layer_002": 0.005768, "value_mse_loss_layer_003": 0.009094, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010437, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.025269, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.057373, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.059814, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000191, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000217, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000399, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000311, "vq_loss_layer_025": 0.000423, "vq_loss_layer_026": 0.000542, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001038, "vq_loss_layer_030": 0.00264, "vq_loss_layer_031": 0.004608 }, { "ce_loss": 2.321397, "epoch": 0.01066, "grad_norm": 0.0014114909572526813, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.051035, "kv_vq_loss": 0.000429, "learning_rate": 0.001, "loss": 0.051511, "step": 10660, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001511, "value_mse_loss_layer_002": 0.005188, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.00769, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.060547, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.055908, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000238, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.000969, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.003235 }, { "ce_loss": 2.310136, "epoch": 0.01067, "grad_norm": 0.0015060610603541136, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.063477, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.081055, "kv_mse_loss": 0.050891, "kv_vq_loss": 0.000439, "learning_rate": 0.001, "loss": 0.051379, "step": 10670, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001495, "value_mse_loss_layer_002": 0.005127, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.007782, "value_mse_loss_layer_005": 0.007538, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.012756, "value_mse_loss_layer_009": 0.0177, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.0177, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.021362, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.02124, "value_mse_loss_layer_018": 0.018066, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.022949, "value_mse_loss_layer_021": 0.026245, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.054199, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000271, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000197, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000368, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.000633, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.001938, "vq_loss_layer_031": 0.003098 }, { "ce_loss": 2.282024, "epoch": 0.01068, "grad_norm": 0.0015687342965975404, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.097656, "key_mse_loss_layer_031": 0.086914, "kv_mse_loss": 0.051016, "kv_vq_loss": 0.000439, "learning_rate": 0.001, "loss": 0.051505, "step": 10680, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001511, "value_mse_loss_layer_002": 0.005341, "value_mse_loss_layer_003": 0.008972, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.010315, "value_mse_loss_layer_008": 0.012756, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.026978, "value_mse_loss_layer_023": 0.030151, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.060791, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.056641, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000265, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000271, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.001778, "vq_loss_layer_030": 0.00264, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.360614, "epoch": 0.01069, "grad_norm": 0.0012963571352884173, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050778, "kv_vq_loss": 0.00043, "learning_rate": 0.001, "loss": 0.05126, "step": 10690, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.00148, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009827, "value_mse_loss_layer_007": 0.010437, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.014221, "value_mse_loss_layer_011": 0.015076, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.056885, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.00024, "vq_loss_layer_012": 0.000381, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000444, "vq_loss_layer_016": 0.000372, "vq_loss_layer_017": 0.000341, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.00022, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000896, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.003342 }, { "ce_loss": 2.30966, "epoch": 0.0107, "grad_norm": 0.0014475510688498616, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.051422, "kv_vq_loss": 0.00045, "learning_rate": 0.001, "loss": 0.051917, "step": 10700, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001503, "value_mse_loss_layer_002": 0.00531, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007629, "value_mse_loss_layer_006": 0.009521, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.014648, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.056641, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000349, "vq_loss_layer_027": 0.000437, "vq_loss_layer_028": 0.000576, "vq_loss_layer_029": 0.000721, "vq_loss_layer_030": 0.00174, "vq_loss_layer_031": 0.002991 }, { "ce_loss": 2.347076, "epoch": 0.01071, "grad_norm": 0.0016388933872804046, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.05101, "kv_vq_loss": 0.000424, "learning_rate": 0.001, "loss": 0.051495, "step": 10710, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001503, "value_mse_loss_layer_002": 0.005096, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.01001, "value_mse_loss_layer_007": 0.010315, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.013916, "value_mse_loss_layer_011": 0.014587, "value_mse_loss_layer_012": 0.015442, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.027222, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.039551, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.060547, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.054688, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000362, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000205, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.251546, "epoch": 0.01072, "grad_norm": 0.001745904330164194, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.041992, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.10791, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.078613, "key_mse_loss_layer_013": 0.142578, "key_mse_loss_layer_014": 0.138672, "key_mse_loss_layer_015": 0.12207, "key_mse_loss_layer_016": 0.119141, "key_mse_loss_layer_017": 0.118164, "key_mse_loss_layer_018": 0.125, "key_mse_loss_layer_019": 0.095703, "key_mse_loss_layer_020": 0.11084, "key_mse_loss_layer_021": 0.105469, "key_mse_loss_layer_022": 0.110352, "key_mse_loss_layer_023": 0.108398, "key_mse_loss_layer_024": 0.085449, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.097656, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.051471, "kv_vq_loss": 0.000453, "learning_rate": 0.001, "loss": 0.051962, "step": 10720, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001488, "value_mse_loss_layer_002": 0.005463, "value_mse_loss_layer_003": 0.009094, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.007996, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.014771, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.024536, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.028931, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.04126, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.055664, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000189, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.00022, "vq_loss_layer_011": 0.000225, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000423, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000391, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000303, "vq_loss_layer_024": 0.000362, "vq_loss_layer_025": 0.000422, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000603, "vq_loss_layer_028": 0.000896, "vq_loss_layer_029": 0.000862, "vq_loss_layer_030": 0.002548, "vq_loss_layer_031": 0.004272 }, { "ce_loss": 2.370031, "epoch": 0.01073, "grad_norm": 0.0014824363170191646, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.051141, "kv_vq_loss": 0.000441, "learning_rate": 0.001, "loss": 0.051636, "step": 10730, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001488, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.008972, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007751, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.011169, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.014343, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.025269, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.060059, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.054199, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000195, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.00036, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000359, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.000224, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000301, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.350646, "epoch": 0.01074, "grad_norm": 0.0015036581316962838, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.050644, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.051114, "step": 10740, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.001511, "value_mse_loss_layer_002": 0.00528, "value_mse_loss_layer_003": 0.008972, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.018066, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.025269, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.057373, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.000456, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.003326 }, { "ce_loss": 2.28245, "epoch": 0.01075, "grad_norm": 0.001415823819115758, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.051053, "kv_vq_loss": 0.000436, "learning_rate": 0.001, "loss": 0.051544, "step": 10750, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001503, "value_mse_loss_layer_002": 0.00531, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.008423, "value_mse_loss_layer_005": 0.007812, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.012634, "value_mse_loss_layer_009": 0.017456, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.026611, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.054443, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000229, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000205, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000399, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000847, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.302565, "epoch": 0.01076, "grad_norm": 0.001861813012510538, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.059814, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050766, "kv_vq_loss": 0.000428, "learning_rate": 0.001, "loss": 0.051239, "step": 10760, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.001495, "value_mse_loss_layer_002": 0.005127, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.056396, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000259, "vq_loss_layer_022": 0.000187, "vq_loss_layer_023": 0.000209, "vq_loss_layer_024": 0.000196, "vq_loss_layer_025": 0.000226, "vq_loss_layer_026": 0.000353, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000576, "vq_loss_layer_029": 0.000847, "vq_loss_layer_030": 0.001686, "vq_loss_layer_031": 0.003174 }, { "ce_loss": 2.333391, "epoch": 0.01077, "grad_norm": 0.0017905381973832846, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.050885, "kv_vq_loss": 0.00043, "learning_rate": 0.001, "loss": 0.051373, "step": 10770, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001511, "value_mse_loss_layer_002": 0.00531, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012817, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014343, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.018555, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.025513, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.030273, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.052734, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.059814, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000313, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.00021, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000155, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000211, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000232, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000832, "vq_loss_layer_029": 0.000946, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.003326 }, { "ce_loss": 2.318275, "epoch": 0.01078, "grad_norm": 0.0015153016429394484, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083496, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050949, "kv_vq_loss": 0.000442, "learning_rate": 0.001, "loss": 0.05144, "step": 10780, "value_mse_loss_layer_000": 0.000526, "value_mse_loss_layer_001": 0.001495, "value_mse_loss_layer_002": 0.005157, "value_mse_loss_layer_003": 0.00946, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.007812, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.017578, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.05835, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000189, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000376, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000441, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000679, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.309527, "epoch": 0.01079, "grad_norm": 0.001567011815495789, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.125977, "key_mse_loss_layer_015": 0.113281, "key_mse_loss_layer_016": 0.10498, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.050635, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.051111, "step": 10790, "value_mse_loss_layer_000": 0.000496, "value_mse_loss_layer_001": 0.001488, "value_mse_loss_layer_002": 0.005615, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.008667, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.010498, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.015503, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.054932, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 7.3e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000146, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000214, "vq_loss_layer_009": 0.000269, "vq_loss_layer_010": 0.000226, "vq_loss_layer_011": 0.000243, "vq_loss_layer_012": 0.000366, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000435, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000374, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000248, "vq_loss_layer_021": 0.000372, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.000359, "vq_loss_layer_025": 0.000511, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000679, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.00116, "vq_loss_layer_030": 0.002869, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.326835, "epoch": 0.0108, "grad_norm": 0.0018258020281791687, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.102539, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.110352, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.100586, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050839, "kv_vq_loss": 0.000439, "learning_rate": 0.001, "loss": 0.051321, "step": 10800, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.001488, "value_mse_loss_layer_002": 0.005402, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.00769, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.060059, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.00012, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.00036, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000224, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000441, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000942, "vq_loss_layer_029": 0.000916, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.003693 }, { "ce_loss": 2.306854, "epoch": 0.01081, "grad_norm": 0.00174066296312958, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050656, "kv_vq_loss": 0.000425, "learning_rate": 0.001, "loss": 0.051129, "step": 10810, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001495, "value_mse_loss_layer_002": 0.005341, "value_mse_loss_layer_003": 0.008972, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.010132, "value_mse_loss_layer_008": 0.012512, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014893, "value_mse_loss_layer_012": 0.015259, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.020386, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.030396, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.057861, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000229, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000465, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.30365, "epoch": 0.01082, "grad_norm": 0.0016935835592448711, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.106934, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050854, "kv_vq_loss": 0.000438, "learning_rate": 0.001, "loss": 0.051343, "step": 10820, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001465, "value_mse_loss_layer_002": 0.005188, "value_mse_loss_layer_003": 0.00885, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.007812, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010132, "value_mse_loss_layer_008": 0.012634, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.025513, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000224, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000847, "vq_loss_layer_029": 0.001183, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.003693 }, { "ce_loss": 2.265511, "epoch": 0.01083, "grad_norm": 0.0014118894468992949, "key_mse_loss_layer_000": 0.00351, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.051031, "kv_vq_loss": 0.000442, "learning_rate": 0.001, "loss": 0.051517, "step": 10830, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001503, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.008972, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.015137, "value_mse_loss_layer_012": 0.015442, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.018311, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.018799, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.029541, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.049805, "value_mse_loss_layer_029": 0.061035, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.05835, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000271, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000221, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.344835, "epoch": 0.01084, "grad_norm": 0.0016362464521080256, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.05087, "kv_vq_loss": 0.000426, "learning_rate": 0.001, "loss": 0.051358, "step": 10840, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001495, "value_mse_loss_layer_002": 0.005249, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007996, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010132, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013916, "value_mse_loss_layer_011": 0.014343, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.023193, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.027222, "value_mse_loss_layer_023": 0.033936, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.043945, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.061523, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.057373, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000215, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000271, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000961, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.261156, "epoch": 0.01085, "grad_norm": 0.0017570098862051964, "key_mse_loss_layer_000": 0.003799, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.106445, "key_mse_loss_layer_014": 0.104004, "key_mse_loss_layer_015": 0.09375, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.051199, "kv_vq_loss": 0.000447, "learning_rate": 0.001, "loss": 0.051688, "step": 10850, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001495, "value_mse_loss_layer_002": 0.00528, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.007782, "value_mse_loss_layer_006": 0.009521, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.02124, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.030273, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.063477, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.057861, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000211, "vq_loss_layer_019": 0.000195, "vq_loss_layer_020": 0.000161, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000225, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002899, "vq_loss_layer_031": 0.003357 }, { "ce_loss": 2.295512, "epoch": 0.01086, "grad_norm": 0.0012627373216673732, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.077637, "kv_mse_loss": 0.050812, "kv_vq_loss": 0.000429, "learning_rate": 0.001, "loss": 0.051291, "step": 10860, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001495, "value_mse_loss_layer_002": 0.005157, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007477, "value_mse_loss_layer_006": 0.009827, "value_mse_loss_layer_007": 0.010132, "value_mse_loss_layer_008": 0.012756, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.014282, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016602, "value_mse_loss_layer_017": 0.020386, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.025513, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.057373, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.001144, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.003174 }, { "ce_loss": 2.31438, "epoch": 0.01087, "grad_norm": 0.0016927490942180157, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.05083, "kv_vq_loss": 0.000428, "learning_rate": 0.001, "loss": 0.0513, "step": 10870, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.001488, "value_mse_loss_layer_002": 0.005707, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.007751, "value_mse_loss_layer_006": 0.009521, "value_mse_loss_layer_007": 0.009949, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013611, "value_mse_loss_layer_011": 0.014526, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.058105, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.00022, "vq_loss_layer_024": 0.000236, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000946, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.277334, "epoch": 0.01088, "grad_norm": 0.0015856699319556355, "key_mse_loss_layer_000": 0.0047, "key_mse_loss_layer_001": 0.011658, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083008, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.068359, "key_mse_loss_layer_013": 0.103516, "key_mse_loss_layer_014": 0.101074, "key_mse_loss_layer_015": 0.091309, "key_mse_loss_layer_016": 0.084473, "key_mse_loss_layer_017": 0.088867, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.05069, "kv_vq_loss": 0.000428, "learning_rate": 0.001, "loss": 0.051172, "step": 10880, "value_mse_loss_layer_000": 0.000542, "value_mse_loss_layer_001": 0.001572, "value_mse_loss_layer_002": 0.005707, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.008728, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.013, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.026001, "value_mse_loss_layer_022": 0.0271, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.037354, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.039062, "value_mse_loss_layer_027": 0.050293, "value_mse_loss_layer_028": 0.053955, "value_mse_loss_layer_029": 0.072266, "value_mse_loss_layer_030": 0.075684, "value_mse_loss_layer_031": 0.062988, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000198, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000135, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000189, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000603, "vq_loss_layer_028": 0.000874, "vq_loss_layer_029": 0.001404, "vq_loss_layer_030": 0.002625, "vq_loss_layer_031": 0.004242 }, { "ce_loss": 2.31795, "epoch": 0.01089, "grad_norm": 0.0016295926179736853, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.129883, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.111816, "key_mse_loss_layer_016": 0.105957, "key_mse_loss_layer_017": 0.106934, "key_mse_loss_layer_018": 0.11377, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.102539, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.101562, "key_mse_loss_layer_023": 0.100586, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.088867, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.050757, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.051245, "step": 10890, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001465, "value_mse_loss_layer_002": 0.00528, "value_mse_loss_layer_003": 0.009277, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.007812, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013977, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.015442, "value_mse_loss_layer_013": 0.017578, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.056641, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000129, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000206, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000246, "vq_loss_layer_011": 0.000221, "vq_loss_layer_012": 0.000391, "vq_loss_layer_013": 0.000395, "vq_loss_layer_014": 0.000446, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000404, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000232, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000412, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000319, "vq_loss_layer_025": 0.000622, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.302525, "epoch": 0.0109, "grad_norm": 0.001533299102447927, "key_mse_loss_layer_000": 0.003708, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.088867, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.05087, "kv_vq_loss": 0.000455, "learning_rate": 0.001, "loss": 0.051358, "step": 10900, "value_mse_loss_layer_000": 0.000526, "value_mse_loss_layer_001": 0.001526, "value_mse_loss_layer_002": 0.00531, "value_mse_loss_layer_003": 0.009094, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.013489, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016968, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.0271, "value_mse_loss_layer_023": 0.032227, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.05127, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.059326, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000202, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000235, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.001167, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.003876 }, { "ce_loss": 2.290387, "epoch": 0.01091, "grad_norm": 0.0021592630073428154, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.05957, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.050998, "kv_vq_loss": 0.000428, "learning_rate": 0.001, "loss": 0.051486, "step": 10910, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001495, "value_mse_loss_layer_002": 0.005249, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.022705, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.029419, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.054199, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000176, "vq_loss_layer_008": 0.000142, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000203, "vq_loss_layer_023": 0.000215, "vq_loss_layer_024": 0.000202, "vq_loss_layer_025": 0.000216, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000614, "vq_loss_layer_029": 0.000866, "vq_loss_layer_030": 0.001534, "vq_loss_layer_031": 0.002625 }, { "ce_loss": 2.32227, "epoch": 0.01092, "grad_norm": 0.0014557348331436515, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.050598, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.05108, "step": 10920, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.00148, "value_mse_loss_layer_002": 0.005035, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.007782, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.009888, "value_mse_loss_layer_008": 0.012634, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.058105, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.054199, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000359, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000239, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000452, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.000813, "vq_loss_layer_030": 0.001656, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.31608, "epoch": 0.01093, "grad_norm": 0.0017793334554880857, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050562, "kv_vq_loss": 0.000424, "learning_rate": 0.001, "loss": 0.051031, "step": 10930, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001503, "value_mse_loss_layer_002": 0.005402, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.008423, "value_mse_loss_layer_005": 0.007599, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.022949, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.026978, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.059082, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000429, "vq_loss_layer_016": 0.000357, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000311, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000243, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000668, "vq_loss_layer_029": 0.000847, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.003738 }, { "ce_loss": 2.300412, "epoch": 0.01094, "grad_norm": 0.0015763833653181791, "key_mse_loss_layer_000": 0.002823, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050891, "kv_vq_loss": 0.000457, "learning_rate": 0.001, "loss": 0.051389, "step": 10940, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.00145, "value_mse_loss_layer_002": 0.005066, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.007599, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000399, "vq_loss_layer_027": 0.000397, "vq_loss_layer_028": 0.000622, "vq_loss_layer_029": 0.000687, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.003036 }, { "ce_loss": 2.303495, "epoch": 0.01095, "grad_norm": 0.0014056973159313202, "key_mse_loss_layer_000": 0.007874, "key_mse_loss_layer_001": 0.013367, "key_mse_loss_layer_002": 0.070801, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.070801, "key_mse_loss_layer_006": 0.076172, "key_mse_loss_layer_007": 0.080078, "key_mse_loss_layer_008": 0.092773, "key_mse_loss_layer_009": 0.09668, "key_mse_loss_layer_010": 0.112305, "key_mse_loss_layer_011": 0.10791, "key_mse_loss_layer_012": 0.081543, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.123535, "key_mse_loss_layer_015": 0.114746, "key_mse_loss_layer_016": 0.10791, "key_mse_loss_layer_017": 0.10498, "key_mse_loss_layer_018": 0.115234, "key_mse_loss_layer_019": 0.097656, "key_mse_loss_layer_020": 0.110352, "key_mse_loss_layer_021": 0.105957, "key_mse_loss_layer_022": 0.108887, "key_mse_loss_layer_023": 0.106934, "key_mse_loss_layer_024": 0.089355, "key_mse_loss_layer_025": 0.083008, "key_mse_loss_layer_026": 0.100586, "key_mse_loss_layer_027": 0.103027, "key_mse_loss_layer_028": 0.10498, "key_mse_loss_layer_029": 0.095703, "key_mse_loss_layer_030": 0.109863, "key_mse_loss_layer_031": 0.081543, "kv_mse_loss": 0.05105, "kv_vq_loss": 0.000426, "learning_rate": 0.001, "loss": 0.051526, "step": 10950, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001457, "value_mse_loss_layer_002": 0.005188, "value_mse_loss_layer_003": 0.009094, "value_mse_loss_layer_004": 0.008667, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.016602, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019287, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.056885, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 9.9e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000185, "vq_loss_layer_008": 0.000211, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000216, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000248, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.000406, "vq_loss_layer_025": 0.000538, "vq_loss_layer_026": 0.000645, "vq_loss_layer_027": 0.000938, "vq_loss_layer_028": 0.001244, "vq_loss_layer_029": 0.001472, "vq_loss_layer_030": 0.003586, "vq_loss_layer_031": 0.004761 }, { "ce_loss": 2.307563, "epoch": 0.01096, "grad_norm": 0.001171354902908206, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050742, "kv_vq_loss": 0.000422, "learning_rate": 0.001, "loss": 0.051208, "step": 10960, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001488, "value_mse_loss_layer_002": 0.00528, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.015503, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.020874, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.02478, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000362, "vq_loss_layer_017": 0.000353, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000182, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.001595, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.336749, "epoch": 0.01097, "grad_norm": 0.0014934168430045247, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050842, "kv_vq_loss": 0.000428, "learning_rate": 0.001, "loss": 0.051324, "step": 10970, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001472, "value_mse_loss_layer_002": 0.005157, "value_mse_loss_layer_003": 0.009521, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007751, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013794, "value_mse_loss_layer_011": 0.014587, "value_mse_loss_layer_012": 0.015564, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000816, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.003036 }, { "ce_loss": 2.333383, "epoch": 0.01098, "grad_norm": 0.0016229196917265654, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.050806, "kv_vq_loss": 0.000421, "learning_rate": 0.001, "loss": 0.051288, "step": 10980, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001495, "value_mse_loss_layer_002": 0.005035, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014404, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.00014, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000208, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000355, "vq_loss_layer_027": 0.000416, "vq_loss_layer_028": 0.000633, "vq_loss_layer_029": 0.00066, "vq_loss_layer_030": 0.001602, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.327788, "epoch": 0.01099, "grad_norm": 0.0016869459068402648, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.050812, "kv_vq_loss": 0.000427, "learning_rate": 0.001, "loss": 0.051291, "step": 10990, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.001472, "value_mse_loss_layer_002": 0.005371, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.007812, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.014526, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.020386, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.058594, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.055664, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000208, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000399, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000207, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000345, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.300257, "epoch": 0.011, "grad_norm": 0.0015783022390678525, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.051019, "kv_vq_loss": 0.000435, "learning_rate": 0.001, "loss": 0.051492, "step": 11000, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001472, "value_mse_loss_layer_002": 0.005127, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007477, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009949, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.024536, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.056152, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000309, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000467, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.000843, "vq_loss_layer_030": 0.001755, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.285053, "epoch": 0.01101, "grad_norm": 0.0015764115378260612, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.05101, "kv_vq_loss": 0.00044, "learning_rate": 0.001, "loss": 0.051501, "step": 11010, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001465, "value_mse_loss_layer_002": 0.005005, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.007782, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.010132, "value_mse_loss_layer_008": 0.012512, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.053955, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000162, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000961, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.320248, "epoch": 0.01102, "grad_norm": 0.0015163536882027984, "key_mse_loss_layer_000": 0.003998, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.09668, "key_mse_loss_layer_031": 0.086426, "kv_mse_loss": 0.050635, "kv_vq_loss": 0.000422, "learning_rate": 0.001, "loss": 0.051102, "step": 11020, "value_mse_loss_layer_000": 0.000523, "value_mse_loss_layer_001": 0.001472, "value_mse_loss_layer_002": 0.005249, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.00824, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.014648, "value_mse_loss_layer_011": 0.015076, "value_mse_loss_layer_012": 0.015991, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000242, "vq_loss_layer_011": 0.000233, "vq_loss_layer_012": 0.000374, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000223, "vq_loss_layer_021": 0.000362, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000336, "vq_loss_layer_024": 0.000391, "vq_loss_layer_025": 0.000523, "vq_loss_layer_026": 0.000698, "vq_loss_layer_027": 0.000889, "vq_loss_layer_028": 0.001633, "vq_loss_layer_029": 0.002762, "vq_loss_layer_030": 0.003845, "vq_loss_layer_031": 0.006287 }, { "ce_loss": 2.331334, "epoch": 0.01103, "grad_norm": 0.0016007334925234318, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050601, "kv_vq_loss": 0.000451, "learning_rate": 0.001, "loss": 0.051096, "step": 11030, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001457, "value_mse_loss_layer_002": 0.005157, "value_mse_loss_layer_003": 0.00885, "value_mse_loss_layer_004": 0.009033, "value_mse_loss_layer_005": 0.007782, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.010315, "value_mse_loss_layer_008": 0.012634, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.014771, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.02478, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.060059, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.05542, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 8e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000185, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000207, "vq_loss_layer_011": 0.00022, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.00041, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000381, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000275, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000347, "vq_loss_layer_024": 0.000232, "vq_loss_layer_025": 0.000326, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.000942, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.299436, "epoch": 0.01104, "grad_norm": 0.001310491468757391, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.062256, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.060303, "kv_mse_loss": 0.050861, "kv_vq_loss": 0.00043, "learning_rate": 0.001, "loss": 0.051346, "step": 11040, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001465, "value_mse_loss_layer_002": 0.005157, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007812, "value_mse_loss_layer_006": 0.009521, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.014893, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.054688, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000201, "vq_loss_layer_011": 0.000231, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.00037, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000345, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000311, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.001114, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.003769 }, { "ce_loss": 2.285515, "epoch": 0.01105, "grad_norm": 0.0013462088536471128, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.050635, "kv_vq_loss": 0.000426, "learning_rate": 0.001, "loss": 0.051105, "step": 11050, "value_mse_loss_layer_000": 0.000496, "value_mse_loss_layer_001": 0.001442, "value_mse_loss_layer_002": 0.005005, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.00766, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.012268, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013611, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.054443, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.00041, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.000355, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.000904, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.003662 }, { "ce_loss": 2.304795, "epoch": 0.01106, "grad_norm": 0.0014302267227321863, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.050949, "kv_vq_loss": 0.000423, "learning_rate": 0.001, "loss": 0.051425, "step": 11060, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001472, "value_mse_loss_layer_002": 0.004944, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012268, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013245, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000141, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000204, "vq_loss_layer_023": 0.000218, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000236, "vq_loss_layer_026": 0.000378, "vq_loss_layer_027": 0.000418, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003113 }, { "ce_loss": 2.308703, "epoch": 0.01107, "grad_norm": 0.0012901807203888893, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050882, "kv_vq_loss": 0.000431, "learning_rate": 0.001, "loss": 0.051361, "step": 11070, "value_mse_loss_layer_000": 0.000496, "value_mse_loss_layer_001": 0.001442, "value_mse_loss_layer_002": 0.00531, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007782, "value_mse_loss_layer_006": 0.009827, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012268, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.013977, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.015503, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000194, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000362, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000376, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.00022, "vq_loss_layer_021": 0.000378, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000366, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000475, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.003357 }, { "ce_loss": 2.293981, "epoch": 0.01108, "grad_norm": 0.001252375659532845, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.05249, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.09375, "key_mse_loss_layer_009": 0.099121, "key_mse_loss_layer_010": 0.111816, "key_mse_loss_layer_011": 0.108887, "key_mse_loss_layer_012": 0.078613, "key_mse_loss_layer_013": 0.145508, "key_mse_loss_layer_014": 0.142578, "key_mse_loss_layer_015": 0.125, "key_mse_loss_layer_016": 0.123535, "key_mse_loss_layer_017": 0.123047, "key_mse_loss_layer_018": 0.128906, "key_mse_loss_layer_019": 0.100098, "key_mse_loss_layer_020": 0.117676, "key_mse_loss_layer_021": 0.109863, "key_mse_loss_layer_022": 0.115723, "key_mse_loss_layer_023": 0.114258, "key_mse_loss_layer_024": 0.088379, "key_mse_loss_layer_025": 0.08252, "key_mse_loss_layer_026": 0.097168, "key_mse_loss_layer_027": 0.090332, "key_mse_loss_layer_028": 0.102539, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.108398, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.051251, "kv_vq_loss": 0.000441, "learning_rate": 0.001, "loss": 0.05173, "step": 11080, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001472, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.008972, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.00766, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.012634, "value_mse_loss_layer_009": 0.017578, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.014587, "value_mse_loss_layer_012": 0.015503, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.029297, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.053711, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000322, "vq_loss_layer_010": 0.000214, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000383, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000462, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.00036, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000232, "vq_loss_layer_021": 0.00036, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000345, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000391, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000422, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.000713, "vq_loss_layer_030": 0.00164, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.302456, "epoch": 0.01109, "grad_norm": 0.0015473384410142899, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.062256, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.050952, "kv_vq_loss": 0.000431, "learning_rate": 0.001, "loss": 0.051428, "step": 11090, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.00148, "value_mse_loss_layer_002": 0.004974, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.058838, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.054199, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000261, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000211, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000399, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.00267 }, { "ce_loss": 2.276485, "epoch": 0.0111, "grad_norm": 0.0014175635296851397, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050775, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.05126, "step": 11100, "value_mse_loss_layer_000": 0.000496, "value_mse_loss_layer_001": 0.001457, "value_mse_loss_layer_002": 0.005005, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007629, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012512, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.053711, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000328, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000239, "vq_loss_layer_024": 0.000226, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000969, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.003098 }, { "ce_loss": 2.299205, "epoch": 0.01111, "grad_norm": 0.0017464637057855725, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.050925, "kv_vq_loss": 0.000436, "learning_rate": 0.001, "loss": 0.05141, "step": 11110, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001518, "value_mse_loss_layer_002": 0.005188, "value_mse_loss_layer_003": 0.009094, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009827, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.012512, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.014526, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.025269, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.063477, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000226, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000189, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.001038, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.003342 }, { "ce_loss": 2.325097, "epoch": 0.01112, "grad_norm": 0.0016173162730410695, "key_mse_loss_layer_000": 0.004089, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.051114, "kv_vq_loss": 0.000427, "learning_rate": 0.001, "loss": 0.05159, "step": 11120, "value_mse_loss_layer_000": 0.00053, "value_mse_loss_layer_001": 0.001503, "value_mse_loss_layer_002": 0.005371, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009521, "value_mse_loss_layer_007": 0.010437, "value_mse_loss_layer_008": 0.012512, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013794, "value_mse_loss_layer_011": 0.014343, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.0271, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.061523, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.056152, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000187, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000219, "vq_loss_layer_011": 0.000239, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.000429, "vq_loss_layer_016": 0.000395, "vq_loss_layer_017": 0.000359, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000404, "vq_loss_layer_022": 0.000336, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.000437, "vq_loss_layer_025": 0.000431, "vq_loss_layer_026": 0.000645, "vq_loss_layer_027": 0.000801, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001312, "vq_loss_layer_030": 0.003799, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.265882, "epoch": 0.01113, "grad_norm": 0.001702814712189138, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.051035, "kv_vq_loss": 0.000436, "learning_rate": 0.001, "loss": 0.051514, "step": 11130, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001457, "value_mse_loss_layer_002": 0.004913, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007629, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013977, "value_mse_loss_layer_011": 0.014648, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000385, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000782, "vq_loss_layer_029": 0.00071, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.305263, "epoch": 0.01114, "grad_norm": 0.001592930406332016, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.050861, "kv_vq_loss": 0.000437, "learning_rate": 0.001, "loss": 0.051343, "step": 11140, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.001472, "value_mse_loss_layer_002": 0.00589, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007751, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.015137, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.025879, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.060059, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.052979, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000205, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000201, "vq_loss_layer_023": 0.000239, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000368, "vq_loss_layer_027": 0.00038, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.001602, "vq_loss_layer_031": 0.002808 }, { "ce_loss": 2.314886, "epoch": 0.01115, "grad_norm": 0.001381071750074625, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050748, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.051202, "step": 11150, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.00148, "value_mse_loss_layer_002": 0.005188, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007721, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.013062, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.013916, "value_mse_loss_layer_011": 0.015076, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.057373, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.053711, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000207, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000328, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000288, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.333513, "epoch": 0.01116, "grad_norm": 0.0016908487305045128, "key_mse_loss_layer_000": 0.002487, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.104004, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.131836, "key_mse_loss_layer_014": 0.126953, "key_mse_loss_layer_015": 0.11377, "key_mse_loss_layer_016": 0.108398, "key_mse_loss_layer_017": 0.108398, "key_mse_loss_layer_018": 0.114746, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.106445, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.050967, "kv_vq_loss": 0.000438, "learning_rate": 0.001, "loss": 0.05145, "step": 11160, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.001457, "value_mse_loss_layer_002": 0.005249, "value_mse_loss_layer_003": 0.009155, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.00824, "value_mse_loss_layer_006": 0.010132, "value_mse_loss_layer_007": 0.011047, "value_mse_loss_layer_008": 0.012634, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014771, "value_mse_loss_layer_011": 0.014587, "value_mse_loss_layer_012": 0.015442, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000186, "vq_loss_layer_008": 0.000205, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000269, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.00036, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000194, "vq_loss_layer_020": 0.000214, "vq_loss_layer_021": 0.000359, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.000317, "vq_loss_layer_025": 0.000469, "vq_loss_layer_026": 0.000626, "vq_loss_layer_027": 0.000599, "vq_loss_layer_028": 0.000969, "vq_loss_layer_029": 0.001442, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.00415 }, { "ce_loss": 2.319937, "epoch": 0.01117, "grad_norm": 0.0014888779260218143, "key_mse_loss_layer_000": 0.004059, "key_mse_loss_layer_001": 0.01123, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.080078, "kv_mse_loss": 0.050891, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.051355, "step": 11170, "value_mse_loss_layer_000": 0.000519, "value_mse_loss_layer_001": 0.001518, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007782, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.012634, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.014282, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.026001, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.041992, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.060791, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000171, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000219, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.001945, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.305099, "epoch": 0.01118, "grad_norm": 0.0014196888078004122, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.058838, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.050946, "kv_vq_loss": 0.000436, "learning_rate": 0.001, "loss": 0.051431, "step": 11180, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001488, "value_mse_loss_layer_002": 0.004974, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014343, "value_mse_loss_layer_012": 0.015259, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000172, "vq_loss_layer_008": 0.000144, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000186, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.00042, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000843, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.002808 }, { "ce_loss": 2.295811, "epoch": 0.01119, "grad_norm": 0.001697721891105175, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.051144, "kv_vq_loss": 0.000434, "learning_rate": 0.001, "loss": 0.05163, "step": 11190, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.001465, "value_mse_loss_layer_002": 0.005035, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.00769, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.060303, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.053711, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000188, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.00071, "vq_loss_layer_029": 0.000862, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.316981, "epoch": 0.0112, "grad_norm": 0.0013659611577168107, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050836, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.051303, "step": 11200, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001457, "value_mse_loss_layer_002": 0.005096, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.010132, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.014526, "value_mse_loss_layer_011": 0.014282, "value_mse_loss_layer_012": 0.015564, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.053711, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000402, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.000231, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000399, "vq_loss_layer_027": 0.000437, "vq_loss_layer_028": 0.000622, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.002991 }, { "ce_loss": 2.349742, "epoch": 0.01121, "grad_norm": 0.001322977477684617, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050604, "kv_vq_loss": 0.000431, "learning_rate": 0.001, "loss": 0.051089, "step": 11210, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.00148, "value_mse_loss_layer_002": 0.005096, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.007751, "value_mse_loss_layer_006": 0.009521, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.014587, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.057373, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.054932, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.003204 }, { "ce_loss": 2.285069, "epoch": 0.01122, "grad_norm": 0.0015439327107742429, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.061279, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.100098, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.050787, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.051245, "step": 11220, "value_mse_loss_layer_000": 0.000456, "value_mse_loss_layer_001": 0.001434, "value_mse_loss_layer_002": 0.00531, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.01062, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.014587, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.024536, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.057373, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.6e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 8.6e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000216, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000252, "vq_loss_layer_011": 0.000248, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.000374, "vq_loss_layer_014": 0.000469, "vq_loss_layer_015": 0.00045, "vq_loss_layer_016": 0.000402, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.00022, "vq_loss_layer_019": 0.000206, "vq_loss_layer_020": 0.000232, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000347, "vq_loss_layer_025": 0.000486, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000698, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.005066 }, { "ce_loss": 2.292761, "epoch": 0.01123, "grad_norm": 0.0015209723496809602, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.050562, "kv_vq_loss": 0.000435, "learning_rate": 0.001, "loss": 0.051044, "step": 11230, "value_mse_loss_layer_000": 0.000496, "value_mse_loss_layer_001": 0.001457, "value_mse_loss_layer_002": 0.00531, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.010315, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.050781, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.05835, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000349, "vq_loss_layer_018": 0.00021, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.001358, "vq_loss_layer_029": 0.001213, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.317638, "epoch": 0.01124, "grad_norm": 0.0012036063708364964, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050705, "kv_vq_loss": 0.000425, "learning_rate": 0.001, "loss": 0.051175, "step": 11240, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.00148, "value_mse_loss_layer_002": 0.005127, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.007721, "value_mse_loss_layer_006": 0.010071, "value_mse_loss_layer_007": 0.010315, "value_mse_loss_layer_008": 0.012695, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.015015, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.018066, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.017212, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.02478, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.054688, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000381, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000241, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000399, "vq_loss_layer_027": 0.000458, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.001755, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.308075, "epoch": 0.01125, "grad_norm": 0.0016620768001303077, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.05061, "kv_vq_loss": 0.000423, "learning_rate": 0.001, "loss": 0.051089, "step": 11250, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001457, "value_mse_loss_layer_002": 0.005066, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007477, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.009949, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013855, "value_mse_loss_layer_011": 0.014404, "value_mse_loss_layer_012": 0.015564, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000212, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000439, "vq_loss_layer_028": 0.000679, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.003174 }, { "ce_loss": 2.315534, "epoch": 0.01126, "grad_norm": 0.0014214947586879134, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.050729, "kv_vq_loss": 0.000434, "learning_rate": 0.001, "loss": 0.051212, "step": 11260, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.001472, "value_mse_loss_layer_002": 0.005188, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.007629, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.012756, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.014282, "value_mse_loss_layer_012": 0.015137, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.055664, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.00023, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.00023, "vq_loss_layer_011": 0.000241, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000416, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000381, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.000387, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.000957, "vq_loss_layer_030": 0.002701, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.288186, "epoch": 0.01127, "grad_norm": 0.001155088422819972, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050977, "kv_vq_loss": 0.000424, "learning_rate": 0.001, "loss": 0.05145, "step": 11270, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001465, "value_mse_loss_layer_002": 0.005005, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000236, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000214, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000444, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.001945, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.31209, "epoch": 0.01128, "grad_norm": 0.0018181658815592527, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.110352, "key_mse_loss_layer_016": 0.103516, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.103516, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.100586, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050879, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.051346, "step": 11280, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001442, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.008301, "value_mse_loss_layer_005": 0.007782, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.013733, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000194, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000332, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000347, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000429, "vq_loss_layer_028": 0.000942, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.003632 }, { "ce_loss": 2.30886, "epoch": 0.01129, "grad_norm": 0.0012525819474831223, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.106445, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.128906, "key_mse_loss_layer_014": 0.123047, "key_mse_loss_layer_015": 0.114258, "key_mse_loss_layer_016": 0.10498, "key_mse_loss_layer_017": 0.10498, "key_mse_loss_layer_018": 0.109863, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.102051, "key_mse_loss_layer_023": 0.099609, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.051028, "kv_vq_loss": 0.00044, "learning_rate": 0.001, "loss": 0.051517, "step": 11290, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001457, "value_mse_loss_layer_002": 0.005371, "value_mse_loss_layer_003": 0.00885, "value_mse_loss_layer_004": 0.008423, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.009521, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014404, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000221, "vq_loss_layer_011": 0.000224, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000374, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000181, "vq_loss_layer_020": 0.000214, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.000404, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.343219, "epoch": 0.0113, "grad_norm": 0.0015780834946781397, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.051016, "kv_vq_loss": 0.000433, "learning_rate": 0.001, "loss": 0.051501, "step": 11300, "value_mse_loss_layer_000": 0.000479, "value_mse_loss_layer_001": 0.001427, "value_mse_loss_layer_002": 0.005005, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007538, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.014343, "value_mse_loss_layer_011": 0.014587, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.018433, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.016602, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.025757, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.028442, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.039551, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.058594, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.054443, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000209, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.000224, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000465, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.002991 }, { "ce_loss": 2.311802, "epoch": 0.01131, "grad_norm": 0.001147970906458795, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050772, "kv_vq_loss": 0.000426, "learning_rate": 0.001, "loss": 0.051239, "step": 11310, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001457, "value_mse_loss_layer_002": 0.004974, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014282, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.018066, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.05249, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000203, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000429, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.00082, "vq_loss_layer_030": 0.001572, "vq_loss_layer_031": 0.002808 }, { "ce_loss": 2.307227, "epoch": 0.01132, "grad_norm": 0.0017559458501636982, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050851, "kv_vq_loss": 0.000433, "learning_rate": 0.001, "loss": 0.051328, "step": 11320, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.00145, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.008972, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.012756, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.013916, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.016479, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020386, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.058838, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.053955, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000202, "vq_loss_layer_011": 0.00022, "vq_loss_layer_012": 0.00042, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.000376, "vq_loss_layer_017": 0.000347, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.00019, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000359, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000484, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000866, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.002655, "vq_loss_layer_031": 0.003387 }, { "ce_loss": 2.330799, "epoch": 0.01133, "grad_norm": 0.0013278250116854906, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.050464, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050919, "step": 11330, "value_mse_loss_layer_000": 0.000515, "value_mse_loss_layer_001": 0.001465, "value_mse_loss_layer_002": 0.005188, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007812, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.014038, "value_mse_loss_layer_011": 0.014648, "value_mse_loss_layer_012": 0.015564, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.023315, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.055664, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000177, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000345, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000242, "vq_loss_layer_020": 0.000241, "vq_loss_layer_021": 0.000347, "vq_loss_layer_022": 0.000271, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000362, "vq_loss_layer_026": 0.000456, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000713, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.297509, "epoch": 0.01134, "grad_norm": 0.0015407581813633442, "key_mse_loss_layer_000": 0.003769, "key_mse_loss_layer_001": 0.011353, "key_mse_loss_layer_002": 0.061768, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050882, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.05134, "step": 11340, "value_mse_loss_layer_000": 0.000526, "value_mse_loss_layer_001": 0.001518, "value_mse_loss_layer_002": 0.005798, "value_mse_loss_layer_003": 0.009338, "value_mse_loss_layer_004": 0.00885, "value_mse_loss_layer_005": 0.00824, "value_mse_loss_layer_006": 0.009888, "value_mse_loss_layer_007": 0.010437, "value_mse_loss_layer_008": 0.012939, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.018188, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.060303, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.058838, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000232, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.000232, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000383, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000239, "vq_loss_layer_019": 0.000185, "vq_loss_layer_020": 0.000231, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000309, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000364, "vq_loss_layer_025": 0.000492, "vq_loss_layer_026": 0.000572, "vq_loss_layer_027": 0.000755, "vq_loss_layer_028": 0.001015, "vq_loss_layer_029": 0.001801, "vq_loss_layer_030": 0.002762, "vq_loss_layer_031": 0.004578 }, { "ce_loss": 2.279164, "epoch": 0.01135, "grad_norm": 0.0011816424084827304, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.061035, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.081055, "key_mse_loss_layer_008": 0.096191, "key_mse_loss_layer_009": 0.102539, "key_mse_loss_layer_010": 0.116211, "key_mse_loss_layer_011": 0.110352, "key_mse_loss_layer_012": 0.08252, "key_mse_loss_layer_013": 0.138672, "key_mse_loss_layer_014": 0.133789, "key_mse_loss_layer_015": 0.125977, "key_mse_loss_layer_016": 0.116211, "key_mse_loss_layer_017": 0.115234, "key_mse_loss_layer_018": 0.123047, "key_mse_loss_layer_019": 0.102539, "key_mse_loss_layer_020": 0.115723, "key_mse_loss_layer_021": 0.112793, "key_mse_loss_layer_022": 0.115723, "key_mse_loss_layer_023": 0.116211, "key_mse_loss_layer_024": 0.095703, "key_mse_loss_layer_025": 0.089844, "key_mse_loss_layer_026": 0.106934, "key_mse_loss_layer_027": 0.11084, "key_mse_loss_layer_028": 0.115234, "key_mse_loss_layer_029": 0.104004, "key_mse_loss_layer_030": 0.10791, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.050961, "kv_vq_loss": 0.000431, "learning_rate": 0.001, "loss": 0.05144, "step": 11350, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001457, "value_mse_loss_layer_002": 0.005432, "value_mse_loss_layer_003": 0.009216, "value_mse_loss_layer_004": 0.008789, "value_mse_loss_layer_005": 0.008118, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010864, "value_mse_loss_layer_008": 0.012756, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.014099, "value_mse_loss_layer_011": 0.014954, "value_mse_loss_layer_012": 0.015869, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.025513, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.037598, "value_mse_loss_layer_027": 0.048584, "value_mse_loss_layer_028": 0.053223, "value_mse_loss_layer_029": 0.063477, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.059814, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.9e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000189, "vq_loss_layer_008": 0.000213, "vq_loss_layer_009": 0.000224, "vq_loss_layer_010": 0.00025, "vq_loss_layer_011": 0.00024, "vq_loss_layer_012": 0.000399, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000423, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000385, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.000225, "vq_loss_layer_021": 0.000374, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.000364, "vq_loss_layer_024": 0.000401, "vq_loss_layer_025": 0.000431, "vq_loss_layer_026": 0.000587, "vq_loss_layer_027": 0.000751, "vq_loss_layer_028": 0.001442, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.003189, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.311197, "epoch": 0.01136, "grad_norm": 0.0015003870939835906, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.104004, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.124512, "key_mse_loss_layer_015": 0.112793, "key_mse_loss_layer_016": 0.105469, "key_mse_loss_layer_017": 0.106934, "key_mse_loss_layer_018": 0.11377, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.103516, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050696, "kv_vq_loss": 0.000427, "learning_rate": 0.001, "loss": 0.051175, "step": 11360, "value_mse_loss_layer_000": 0.000483, "value_mse_loss_layer_001": 0.001434, "value_mse_loss_layer_002": 0.00528, "value_mse_loss_layer_003": 0.009033, "value_mse_loss_layer_004": 0.008362, "value_mse_loss_layer_005": 0.007996, "value_mse_loss_layer_006": 0.009949, "value_mse_loss_layer_007": 0.010376, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.014221, "value_mse_loss_layer_011": 0.014587, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.054443, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000261, "vq_loss_layer_010": 0.000233, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000362, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000444, "vq_loss_layer_015": 0.000534, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.000341, "vq_loss_layer_018": 0.000211, "vq_loss_layer_019": 0.000188, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000351, "vq_loss_layer_024": 0.000364, "vq_loss_layer_025": 0.000456, "vq_loss_layer_026": 0.000584, "vq_loss_layer_027": 0.00066, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.001228, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.261669, "epoch": 0.01137, "grad_norm": 0.0013276530662551522, "key_mse_loss_layer_000": 0.002777, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.051288, "kv_vq_loss": 0.000443, "learning_rate": 0.001, "loss": 0.05177, "step": 11370, "value_mse_loss_layer_000": 0.000496, "value_mse_loss_layer_001": 0.00145, "value_mse_loss_layer_002": 0.004974, "value_mse_loss_layer_003": 0.008972, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007416, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.011963, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000191, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.003098 }, { "ce_loss": 2.272053, "epoch": 0.01138, "grad_norm": 0.001282008015550673, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.103027, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050876, "kv_vq_loss": 0.000447, "learning_rate": 0.001, "loss": 0.051364, "step": 11380, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.00145, "value_mse_loss_layer_002": 0.005005, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007599, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000202, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000408, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000355, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.000946, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.003204 }, { "ce_loss": 2.288134, "epoch": 0.01139, "grad_norm": 0.0016121342778205872, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050827, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.051297, "step": 11390, "value_mse_loss_layer_000": 0.000486, "value_mse_loss_layer_001": 0.001434, "value_mse_loss_layer_002": 0.004974, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.055664, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000223, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.000439, "vq_loss_layer_028": 0.000645, "vq_loss_layer_029": 0.000713, "vq_loss_layer_030": 0.001572, "vq_loss_layer_031": 0.003342 }, { "ce_loss": 2.301957, "epoch": 0.0114, "grad_norm": 0.0012670555151998997, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.094727, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.050424, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.0509, "step": 11400, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.001427, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.00766, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.053955, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.001602, "vq_loss_layer_031": 0.003342 }, { "ce_loss": 2.300234, "epoch": 0.01141, "grad_norm": 0.0017111141933128238, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.050888, "kv_vq_loss": 0.000447, "learning_rate": 0.001, "loss": 0.051373, "step": 11410, "value_mse_loss_layer_000": 0.000496, "value_mse_loss_layer_001": 0.00145, "value_mse_loss_layer_002": 0.005127, "value_mse_loss_layer_003": 0.00885, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012939, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013794, "value_mse_loss_layer_011": 0.014526, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.026611, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.059814, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.000229, "vq_loss_layer_019": 0.000216, "vq_loss_layer_020": 0.000239, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000357, "vq_loss_layer_026": 0.000484, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.001358, "vq_loss_layer_030": 0.002792, "vq_loss_layer_031": 0.003754 }, { "ce_loss": 2.324599, "epoch": 0.01142, "grad_norm": 0.0015496904961764812, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.060303, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.09375, "key_mse_loss_layer_031": 0.085449, "kv_mse_loss": 0.050809, "kv_vq_loss": 0.000421, "learning_rate": 0.001, "loss": 0.051291, "step": 11420, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001472, "value_mse_loss_layer_002": 0.005066, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007477, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013611, "value_mse_loss_layer_011": 0.014587, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016724, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.025269, "value_mse_loss_layer_022": 0.026855, "value_mse_loss_layer_023": 0.029907, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.05835, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.000816, "vq_loss_layer_029": 0.001373, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.004517 }, { "ce_loss": 2.30171, "epoch": 0.01143, "grad_norm": 0.0014158929698169231, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.060059, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.050784, "kv_vq_loss": 0.000429, "learning_rate": 0.001, "loss": 0.051263, "step": 11430, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001472, "value_mse_loss_layer_002": 0.004883, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.028442, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.054688, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000168, "vq_loss_layer_021": 0.000271, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.000242, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000469, "vq_loss_layer_028": 0.000645, "vq_loss_layer_029": 0.000896, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.00351 }, { "ce_loss": 2.298205, "epoch": 0.01144, "grad_norm": 0.0014596503460779786, "key_mse_loss_layer_000": 0.002625, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.044189, "key_mse_loss_layer_004": 0.040771, "key_mse_loss_layer_005": 0.054932, "key_mse_loss_layer_006": 0.062256, "key_mse_loss_layer_007": 0.071777, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.058594, "kv_mse_loss": 0.050507, "kv_vq_loss": 0.000425, "learning_rate": 0.001, "loss": 0.050989, "step": 11440, "value_mse_loss_layer_000": 0.000479, "value_mse_loss_layer_001": 0.001411, "value_mse_loss_layer_002": 0.005005, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007812, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000182, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000217, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000422, "vq_loss_layer_015": 0.000462, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000144, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000236, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.003693 }, { "ce_loss": 2.299236, "epoch": 0.01145, "grad_norm": 0.0012625254457816482, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.050653, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.051123, "step": 11450, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001434, "value_mse_loss_layer_002": 0.005035, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007294, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.012268, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000165, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.001938, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.325766, "epoch": 0.01146, "grad_norm": 0.0012406010646373034, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050629, "kv_vq_loss": 0.000416, "learning_rate": 0.001, "loss": 0.05108, "step": 11460, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001442, "value_mse_loss_layer_002": 0.005035, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007477, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.017212, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.015076, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000224, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000196, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000401, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.000721, "vq_loss_layer_030": 0.001678, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.309118, "epoch": 0.01147, "grad_norm": 0.0016670837067067623, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.05087, "kv_vq_loss": 0.000454, "learning_rate": 0.001, "loss": 0.05137, "step": 11470, "value_mse_loss_layer_000": 0.000496, "value_mse_loss_layer_001": 0.001442, "value_mse_loss_layer_002": 0.004883, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007599, "value_mse_loss_layer_006": 0.009521, "value_mse_loss_layer_007": 0.009888, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.014526, "value_mse_loss_layer_012": 0.015503, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.025879, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.044434, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.057861, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000385, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000364, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.003174 }, { "ce_loss": 2.354457, "epoch": 0.01148, "grad_norm": 0.0012133850250393152, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050381, "kv_vq_loss": 0.000407, "learning_rate": 0.001, "loss": 0.050821, "step": 11480, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001434, "value_mse_loss_layer_002": 0.005066, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.00766, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009949, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013245, "value_mse_loss_layer_011": 0.014343, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.020996, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.00017, "vq_loss_layer_019": 0.000228, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000209, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000401, "vq_loss_layer_027": 0.000416, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.000755, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.363629, "epoch": 0.01149, "grad_norm": 0.001565716345794499, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010925, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.094727, "key_mse_loss_layer_010": 0.108887, "key_mse_loss_layer_011": 0.105957, "key_mse_loss_layer_012": 0.082031, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.11377, "key_mse_loss_layer_016": 0.103516, "key_mse_loss_layer_017": 0.103027, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.087891, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.096191, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.095215, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.050354, "kv_vq_loss": 0.000417, "learning_rate": 0.001, "loss": 0.050824, "step": 11490, "value_mse_loss_layer_000": 0.000471, "value_mse_loss_layer_001": 0.001434, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007935, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.010315, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.014282, "value_mse_loss_layer_011": 0.014771, "value_mse_loss_layer_012": 0.016113, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.058105, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.056396, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.000254, "vq_loss_layer_011": 0.000229, "vq_loss_layer_012": 0.000387, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000391, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.000225, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000248, "vq_loss_layer_021": 0.000374, "vq_loss_layer_022": 0.000305, "vq_loss_layer_023": 0.000332, "vq_loss_layer_024": 0.000338, "vq_loss_layer_025": 0.0005, "vq_loss_layer_026": 0.000607, "vq_loss_layer_027": 0.000652, "vq_loss_layer_028": 0.00119, "vq_loss_layer_029": 0.00135, "vq_loss_layer_030": 0.002777, "vq_loss_layer_031": 0.004944 }, { "ce_loss": 2.296131, "epoch": 0.0115, "grad_norm": 0.0014926957665011287, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050531, "kv_vq_loss": 0.000429, "learning_rate": 0.001, "loss": 0.051016, "step": 11500, "value_mse_loss_layer_000": 0.000486, "value_mse_loss_layer_001": 0.001419, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.009888, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.014404, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.052979, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000309, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.000847, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.33848, "epoch": 0.01151, "grad_norm": 0.0013131701853126287, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.050299, "kv_vq_loss": 0.000415, "learning_rate": 0.001, "loss": 0.050772, "step": 11510, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001427, "value_mse_loss_layer_002": 0.005066, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007629, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.053711, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000176, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.00036, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.374951, "epoch": 0.01152, "grad_norm": 0.001503675477579236, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050388, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.050867, "step": 11520, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001442, "value_mse_loss_layer_002": 0.004913, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.007782, "value_mse_loss_layer_005": 0.007416, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015564, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.02478, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.053711, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000226, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000408, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.32618, "epoch": 0.01153, "grad_norm": 0.001348824822343886, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.060547, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050146, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050607, "step": 11530, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001427, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.01709, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.028564, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.052979, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000195, "vq_loss_layer_025": 0.000239, "vq_loss_layer_026": 0.000362, "vq_loss_layer_027": 0.000437, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000706, "vq_loss_layer_030": 0.001518, "vq_loss_layer_031": 0.00267 }, { "ce_loss": 2.286463, "epoch": 0.01154, "grad_norm": 0.0015646859537810087, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.107422, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.09375, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.093262, "key_mse_loss_layer_031": 0.086426, "kv_mse_loss": 0.050577, "kv_vq_loss": 0.000427, "learning_rate": 0.001, "loss": 0.051056, "step": 11540, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001442, "value_mse_loss_layer_002": 0.005096, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.00766, "value_mse_loss_layer_005": 0.007385, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.018311, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.041016, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.056885, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000157, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000246, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000271, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000713, "vq_loss_layer_028": 0.00106, "vq_loss_layer_029": 0.002182, "vq_loss_layer_030": 0.003433, "vq_loss_layer_031": 0.004974 }, { "ce_loss": 2.316852, "epoch": 0.01155, "grad_norm": 0.0013397138100117445, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.05051, "kv_vq_loss": 0.00044, "learning_rate": 0.001, "loss": 0.050995, "step": 11550, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001434, "value_mse_loss_layer_002": 0.005005, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007874, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012634, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.020752, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.02478, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000242, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000484, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.00082, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.003204 }, { "ce_loss": 2.281484, "epoch": 0.01156, "grad_norm": 0.0012405985035002232, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050562, "kv_vq_loss": 0.000421, "learning_rate": 0.001, "loss": 0.051025, "step": 11560, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001442, "value_mse_loss_layer_002": 0.005005, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009521, "value_mse_loss_layer_007": 0.009949, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.02478, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000224, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.00082, "vq_loss_layer_030": 0.001656, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.315314, "epoch": 0.01157, "grad_norm": 0.001731824828311801, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050735, "kv_vq_loss": 0.000428, "learning_rate": 0.001, "loss": 0.051221, "step": 11570, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.001434, "value_mse_loss_layer_002": 0.005157, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.014404, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.054688, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000224, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000212, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000435, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.002441, "vq_loss_layer_031": 0.003067 }, { "ce_loss": 2.336476, "epoch": 0.01158, "grad_norm": 0.0015118541195988655, "key_mse_loss_layer_000": 0.013672, "key_mse_loss_layer_001": 0.02417, "key_mse_loss_layer_002": 0.072266, "key_mse_loss_layer_003": 0.064453, "key_mse_loss_layer_004": 0.064453, "key_mse_loss_layer_005": 0.076172, "key_mse_loss_layer_006": 0.091797, "key_mse_loss_layer_007": 0.088379, "key_mse_loss_layer_008": 0.094238, "key_mse_loss_layer_009": 0.096191, "key_mse_loss_layer_010": 0.118164, "key_mse_loss_layer_011": 0.112793, "key_mse_loss_layer_012": 0.083496, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.129883, "key_mse_loss_layer_015": 0.125, "key_mse_loss_layer_016": 0.116211, "key_mse_loss_layer_017": 0.115723, "key_mse_loss_layer_018": 0.129883, "key_mse_loss_layer_019": 0.115723, "key_mse_loss_layer_020": 0.122559, "key_mse_loss_layer_021": 0.116699, "key_mse_loss_layer_022": 0.119141, "key_mse_loss_layer_023": 0.112305, "key_mse_loss_layer_024": 0.100098, "key_mse_loss_layer_025": 0.091797, "key_mse_loss_layer_026": 0.119141, "key_mse_loss_layer_027": 0.129883, "key_mse_loss_layer_028": 0.134766, "key_mse_loss_layer_029": 0.141602, "key_mse_loss_layer_030": 0.15918, "key_mse_loss_layer_031": 0.134766, "kv_mse_loss": 0.050378, "kv_vq_loss": 0.000425, "learning_rate": 0.001, "loss": 0.050858, "step": 11580, "value_mse_loss_layer_000": 0.000706, "value_mse_loss_layer_001": 0.001564, "value_mse_loss_layer_002": 0.005737, "value_mse_loss_layer_003": 0.009644, "value_mse_loss_layer_004": 0.009399, "value_mse_loss_layer_005": 0.008362, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.015137, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.019531, "value_mse_loss_layer_019": 0.021729, "value_mse_loss_layer_020": 0.023682, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.026978, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.037842, "value_mse_loss_layer_025": 0.044678, "value_mse_loss_layer_026": 0.039062, "value_mse_loss_layer_027": 0.053955, "value_mse_loss_layer_028": 0.052979, "value_mse_loss_layer_029": 0.072266, "value_mse_loss_layer_030": 0.085938, "value_mse_loss_layer_031": 0.068848, "vq_loss_layer_000": 1.4e-05, "vq_loss_layer_001": 1.9e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 7.4e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000246, "vq_loss_layer_011": 0.000241, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000458, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.000362, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000235, "vq_loss_layer_019": 0.00023, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000629, "vq_loss_layer_022": 0.000238, "vq_loss_layer_023": 0.000157, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000782, "vq_loss_layer_026": 0.000717, "vq_loss_layer_027": 0.001266, "vq_loss_layer_028": 0.001747, "vq_loss_layer_029": 0.00238, "vq_loss_layer_030": 0.004944, "vq_loss_layer_031": 0.005768 }, { "ce_loss": 2.284071, "epoch": 0.01159, "grad_norm": 0.0012310727033764124, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.051141, "kv_vq_loss": 0.000429, "learning_rate": 0.001, "loss": 0.051617, "step": 11590, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.001442, "value_mse_loss_layer_002": 0.004974, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.012268, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.030151, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.05835, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.054199, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000227, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.000965, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.303029, "epoch": 0.0116, "grad_norm": 0.001533754519186914, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.050558, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.051019, "step": 11600, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.001457, "value_mse_loss_layer_002": 0.005066, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007629, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.009949, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.015137, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.058105, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.324067, "epoch": 0.01161, "grad_norm": 0.0017029318260028958, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.042236, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.136719, "key_mse_loss_layer_014": 0.130859, "key_mse_loss_layer_015": 0.119141, "key_mse_loss_layer_016": 0.114258, "key_mse_loss_layer_017": 0.114258, "key_mse_loss_layer_018": 0.123535, "key_mse_loss_layer_019": 0.100098, "key_mse_loss_layer_020": 0.11377, "key_mse_loss_layer_021": 0.104492, "key_mse_loss_layer_022": 0.112305, "key_mse_loss_layer_023": 0.112793, "key_mse_loss_layer_024": 0.090332, "key_mse_loss_layer_025": 0.083008, "key_mse_loss_layer_026": 0.099609, "key_mse_loss_layer_027": 0.094727, "key_mse_loss_layer_028": 0.104492, "key_mse_loss_layer_029": 0.091309, "key_mse_loss_layer_030": 0.100586, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050964, "kv_vq_loss": 0.000447, "learning_rate": 0.001, "loss": 0.05145, "step": 11610, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001411, "value_mse_loss_layer_002": 0.004913, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.008423, "value_mse_loss_layer_005": 0.007568, "value_mse_loss_layer_006": 0.009521, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.016846, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.022339, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.058838, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000204, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000322, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.00025, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000206, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.000328, "vq_loss_layer_025": 0.000418, "vq_loss_layer_026": 0.000645, "vq_loss_layer_027": 0.000645, "vq_loss_layer_028": 0.001015, "vq_loss_layer_029": 0.00174, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.301181, "epoch": 0.01162, "grad_norm": 0.0014828979037702084, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.094727, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.086426, "key_mse_loss_layer_023": 0.083984, "key_mse_loss_layer_024": 0.066406, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050555, "kv_vq_loss": 0.000431, "learning_rate": 0.001, "loss": 0.051031, "step": 11620, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001434, "value_mse_loss_layer_002": 0.005005, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007629, "value_mse_loss_layer_005": 0.007385, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012268, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.000197, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.33937, "epoch": 0.01163, "grad_norm": 0.0011882346589118242, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050238, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.050705, "step": 11630, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001434, "value_mse_loss_layer_002": 0.004974, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.014282, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000359, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000477, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.001823, "vq_loss_layer_031": 0.003128 }, { "ce_loss": 2.299331, "epoch": 0.01164, "grad_norm": 0.0014922963455319405, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.05094, "kv_vq_loss": 0.000421, "learning_rate": 0.001, "loss": 0.051416, "step": 11640, "value_mse_loss_layer_000": 0.000483, "value_mse_loss_layer_001": 0.001411, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007538, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009949, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.015137, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.026245, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000209, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.304039, "epoch": 0.01165, "grad_norm": 0.0016007659723982215, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.050717, "kv_vq_loss": 0.000426, "learning_rate": 0.001, "loss": 0.051196, "step": 11650, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001442, "value_mse_loss_layer_002": 0.005035, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.009888, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.024536, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000164, "vq_loss_layer_021": 0.000265, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.001122, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.352227, "epoch": 0.01166, "grad_norm": 0.0012278605718165636, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050385, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.050858, "step": 11660, "value_mse_loss_layer_000": 0.000486, "value_mse_loss_layer_001": 0.001419, "value_mse_loss_layer_002": 0.004883, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.007782, "value_mse_loss_layer_005": 0.007385, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.013245, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.028442, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.057861, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.054932, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000183, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.00079, "vq_loss_layer_029": 0.000923, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.003036 }, { "ce_loss": 2.294532, "epoch": 0.01167, "grad_norm": 0.001294258632697165, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.050812, "kv_vq_loss": 0.000423, "learning_rate": 0.001, "loss": 0.051279, "step": 11670, "value_mse_loss_layer_000": 0.000484, "value_mse_loss_layer_001": 0.001419, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007416, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.014526, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000177, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000236, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000191, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000368, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000755, "vq_loss_layer_030": 0.001465, "vq_loss_layer_031": 0.00267 }, { "ce_loss": 2.306333, "epoch": 0.01168, "grad_norm": 0.0016430087853223085, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.050803, "kv_vq_loss": 0.000426, "learning_rate": 0.001, "loss": 0.051285, "step": 11680, "value_mse_loss_layer_000": 0.000496, "value_mse_loss_layer_001": 0.00145, "value_mse_loss_layer_002": 0.005035, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.053711, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000207, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.002441, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.282072, "epoch": 0.01169, "grad_norm": 0.0012475040275603533, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.10791, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.098633, "key_mse_loss_layer_031": 0.07959, "kv_mse_loss": 0.050833, "kv_vq_loss": 0.000426, "learning_rate": 0.001, "loss": 0.051303, "step": 11690, "value_mse_loss_layer_000": 0.000486, "value_mse_loss_layer_001": 0.001419, "value_mse_loss_layer_002": 0.004913, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007568, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.011963, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000189, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000326, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.00036, "vq_loss_layer_026": 0.000483, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.003525 }, { "ce_loss": 2.299144, "epoch": 0.0117, "grad_norm": 0.0017083307029679418, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050839, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.051291, "step": 11700, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001419, "value_mse_loss_layer_002": 0.004913, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007324, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.015259, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000229, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.000931, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.241145, "epoch": 0.01171, "grad_norm": 0.0012522181496024132, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.087402, "kv_mse_loss": 0.051129, "kv_vq_loss": 0.000467, "learning_rate": 0.001, "loss": 0.05162, "step": 11710, "value_mse_loss_layer_000": 0.000486, "value_mse_loss_layer_001": 0.00145, "value_mse_loss_layer_002": 0.005463, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007538, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009949, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.018066, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.039551, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000191, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000248, "vq_loss_layer_014": 0.000311, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000234, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000156, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000355, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000652, "vq_loss_layer_028": 0.000977, "vq_loss_layer_029": 0.001701, "vq_loss_layer_030": 0.002655, "vq_loss_layer_031": 0.004852 }, { "ce_loss": 2.297672, "epoch": 0.01172, "grad_norm": 0.0017331207636743784, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.050345, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050787, "step": 11720, "value_mse_loss_layer_000": 0.000511, "value_mse_loss_layer_001": 0.001457, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007721, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.022827, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.056152, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000229, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000236, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.000534, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000961, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.00354 }, { "ce_loss": 2.279399, "epoch": 0.01173, "grad_norm": 0.0013255220837891102, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.050925, "kv_vq_loss": 0.000426, "learning_rate": 0.001, "loss": 0.051407, "step": 11730, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001419, "value_mse_loss_layer_002": 0.004913, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007294, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.001205, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.003525 }, { "ce_loss": 2.356627, "epoch": 0.01174, "grad_norm": 0.0013593218754976988, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.050601, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.051065, "step": 11740, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001419, "value_mse_loss_layer_002": 0.005066, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.026611, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.028442, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000168, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000391, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.271008, "epoch": 0.01175, "grad_norm": 0.001569724758155644, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.106445, "key_mse_loss_layer_014": 0.103516, "key_mse_loss_layer_015": 0.093262, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.051028, "kv_vq_loss": 0.000433, "learning_rate": 0.001, "loss": 0.051517, "step": 11750, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.001419, "value_mse_loss_layer_002": 0.004913, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.025879, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.03064, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000311, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000381, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.316996, "epoch": 0.01176, "grad_norm": 0.0014721787301823497, "key_mse_loss_layer_000": 0.00473, "key_mse_loss_layer_001": 0.011963, "key_mse_loss_layer_002": 0.059326, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.072266, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.091797, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.050729, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.051196, "step": 11760, "value_mse_loss_layer_000": 0.000507, "value_mse_loss_layer_001": 0.001434, "value_mse_loss_layer_002": 0.005066, "value_mse_loss_layer_003": 0.008972, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013916, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.036377, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.062012, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.060059, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000236, "vq_loss_layer_010": 0.000217, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000433, "vq_loss_layer_018": 0.000265, "vq_loss_layer_019": 0.000271, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.00042, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000362, "vq_loss_layer_025": 0.000416, "vq_loss_layer_026": 0.000923, "vq_loss_layer_027": 0.000717, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.001442, "vq_loss_layer_030": 0.002731, "vq_loss_layer_031": 0.004517 }, { "ce_loss": 2.318832, "epoch": 0.01177, "grad_norm": 0.0011402653763070703, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050885, "kv_vq_loss": 0.000433, "learning_rate": 0.001, "loss": 0.051346, "step": 11770, "value_mse_loss_layer_000": 0.000484, "value_mse_loss_layer_001": 0.001427, "value_mse_loss_layer_002": 0.004913, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.000223, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.000444, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.001892, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.338378, "epoch": 0.01178, "grad_norm": 0.001359758898615837, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.060791, "kv_mse_loss": 0.050888, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.051352, "step": 11780, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.001419, "value_mse_loss_layer_002": 0.004974, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007812, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.026123, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.053711, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000437, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000343, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.001602, "vq_loss_layer_031": 0.003448 }, { "ce_loss": 2.329009, "epoch": 0.01179, "grad_norm": 0.0014396480983123183, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.050641, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.051096, "step": 11790, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001419, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007263, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000427, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.00038, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.002808 }, { "ce_loss": 2.270173, "epoch": 0.0118, "grad_norm": 0.0016623861156404018, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.051285, "kv_vq_loss": 0.000434, "learning_rate": 0.001, "loss": 0.051764, "step": 11800, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001419, "value_mse_loss_layer_002": 0.004913, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007538, "value_mse_loss_layer_006": 0.009521, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013794, "value_mse_loss_layer_011": 0.014648, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.054688, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.002823, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.294768, "epoch": 0.01181, "grad_norm": 0.001371510443277657, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050693, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.051154, "step": 11810, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.001404, "value_mse_loss_layer_002": 0.005035, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007568, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009888, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.014526, "value_mse_loss_layer_012": 0.015259, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.02478, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.001656, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.300982, "epoch": 0.01182, "grad_norm": 0.0014434789773076773, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050754, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.051221, "step": 11820, "value_mse_loss_layer_000": 0.000496, "value_mse_loss_layer_001": 0.001427, "value_mse_loss_layer_002": 0.004974, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007782, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000161, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.001778, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.299338, "epoch": 0.01183, "grad_norm": 0.0016715399688109756, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.072754, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.09375, "key_mse_loss_layer_009": 0.101074, "key_mse_loss_layer_010": 0.115723, "key_mse_loss_layer_011": 0.109375, "key_mse_loss_layer_012": 0.081055, "key_mse_loss_layer_013": 0.144531, "key_mse_loss_layer_014": 0.139648, "key_mse_loss_layer_015": 0.130859, "key_mse_loss_layer_016": 0.122559, "key_mse_loss_layer_017": 0.12207, "key_mse_loss_layer_018": 0.126953, "key_mse_loss_layer_019": 0.104492, "key_mse_loss_layer_020": 0.119629, "key_mse_loss_layer_021": 0.116211, "key_mse_loss_layer_022": 0.119629, "key_mse_loss_layer_023": 0.116699, "key_mse_loss_layer_024": 0.092773, "key_mse_loss_layer_025": 0.089355, "key_mse_loss_layer_026": 0.105957, "key_mse_loss_layer_027": 0.105469, "key_mse_loss_layer_028": 0.111328, "key_mse_loss_layer_029": 0.099121, "key_mse_loss_layer_030": 0.108887, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.050952, "kv_vq_loss": 0.000429, "learning_rate": 0.001, "loss": 0.051434, "step": 11830, "value_mse_loss_layer_000": 0.000454, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.01062, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014526, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.057861, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.055664, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 6.8e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000144, "vq_loss_layer_007": 0.00019, "vq_loss_layer_008": 0.000212, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000269, "vq_loss_layer_011": 0.000269, "vq_loss_layer_012": 0.000393, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.000437, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000402, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000257, "vq_loss_layer_021": 0.000444, "vq_loss_layer_022": 0.000303, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000452, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.001282, "vq_loss_layer_029": 0.001335, "vq_loss_layer_030": 0.002991, "vq_loss_layer_031": 0.004517 }, { "ce_loss": 2.314757, "epoch": 0.01184, "grad_norm": 0.0012905207695439458, "key_mse_loss_layer_000": 0.002502, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.044922, "key_mse_loss_layer_004": 0.042236, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.07959, "key_mse_loss_layer_013": 0.133789, "key_mse_loss_layer_014": 0.128906, "key_mse_loss_layer_015": 0.115723, "key_mse_loss_layer_016": 0.107422, "key_mse_loss_layer_017": 0.109375, "key_mse_loss_layer_018": 0.112793, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.099609, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.060059, "kv_mse_loss": 0.0505, "kv_vq_loss": 0.000424, "learning_rate": 0.001, "loss": 0.050989, "step": 11840, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004944, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007538, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009949, "value_mse_loss_layer_008": 0.011963, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.022217, "value_mse_loss_layer_023": 0.024414, "value_mse_loss_layer_024": 0.026489, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.027954, "value_mse_loss_layer_027": 0.0354, "value_mse_loss_layer_028": 0.04126, "value_mse_loss_layer_029": 0.046631, "value_mse_loss_layer_030": 0.051514, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000235, "vq_loss_layer_010": 0.000222, "vq_loss_layer_011": 0.000221, "vq_loss_layer_012": 0.000399, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000425, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000241, "vq_loss_layer_021": 0.000439, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.000338, "vq_loss_layer_024": 0.000322, "vq_loss_layer_025": 0.000469, "vq_loss_layer_026": 0.000467, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000942, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.003616 }, { "ce_loss": 2.319573, "epoch": 0.01185, "grad_norm": 0.0011962726712226868, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.058838, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.050839, "kv_vq_loss": 0.000421, "learning_rate": 0.001, "loss": 0.051315, "step": 11850, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001411, "value_mse_loss_layer_002": 0.004913, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.007141, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013245, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.024536, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000236, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000412, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000797, "vq_loss_layer_030": 0.00164, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.264946, "epoch": 0.01186, "grad_norm": 0.001238961354829371, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050491, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.050949, "step": 11860, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001419, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.007629, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009888, "value_mse_loss_layer_008": 0.012268, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.024536, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000267, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000134, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000237, "vq_loss_layer_022": 0.000181, "vq_loss_layer_023": 0.000217, "vq_loss_layer_024": 0.000187, "vq_loss_layer_025": 0.000195, "vq_loss_layer_026": 0.000368, "vq_loss_layer_027": 0.000387, "vq_loss_layer_028": 0.000587, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.001717, "vq_loss_layer_031": 0.002563 }, { "ce_loss": 2.330581, "epoch": 0.01187, "grad_norm": 0.0015957814175635576, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050208, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.050684, "step": 11870, "value_mse_loss_layer_000": 0.000481, "value_mse_loss_layer_001": 0.001389, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007294, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000157, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000204, "vq_loss_layer_020": 0.000162, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.0002, "vq_loss_layer_024": 0.000198, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000381, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.00069, "vq_loss_layer_029": 0.000744, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.352907, "epoch": 0.01188, "grad_norm": 0.0013759775320068002, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.050668, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.051147, "step": 11880, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001389, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.011963, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.001762, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.348132, "epoch": 0.01189, "grad_norm": 0.0011651344830170274, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050378, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050824, "step": 11890, "value_mse_loss_layer_000": 0.000496, "value_mse_loss_layer_001": 0.001404, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000408, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.00071, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.001869, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.354645, "epoch": 0.0119, "grad_norm": 0.0014323672512546182, "key_mse_loss_layer_000": 0.003799, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.050949, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.051428, "step": 11900, "value_mse_loss_layer_000": 0.0005, "value_mse_loss_layer_001": 0.001434, "value_mse_loss_layer_002": 0.004883, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.04126, "value_mse_loss_layer_026": 0.037109, "value_mse_loss_layer_027": 0.047852, "value_mse_loss_layer_028": 0.051758, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.057617, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000191, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000224, "vq_loss_layer_023": 0.00022, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.001076, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.322992, "epoch": 0.01191, "grad_norm": 0.0010928072733804584, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050327, "kv_vq_loss": 0.000413, "learning_rate": 0.001, "loss": 0.050793, "step": 11910, "value_mse_loss_layer_000": 0.000486, "value_mse_loss_layer_001": 0.001427, "value_mse_loss_layer_002": 0.004944, "value_mse_loss_layer_003": 0.00885, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007599, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.010132, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.053711, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000233, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000243, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.003113 }, { "ce_loss": 2.303988, "epoch": 0.01192, "grad_norm": 0.001753468532115221, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.050488, "kv_vq_loss": 0.000422, "learning_rate": 0.001, "loss": 0.050955, "step": 11920, "value_mse_loss_layer_000": 0.000504, "value_mse_loss_layer_001": 0.001427, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.057861, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000782, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.277105, "epoch": 0.01193, "grad_norm": 0.0012283403193578124, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.072754, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050809, "kv_vq_loss": 0.000426, "learning_rate": 0.001, "loss": 0.051276, "step": 11930, "value_mse_loss_layer_000": 0.000483, "value_mse_loss_layer_001": 0.001396, "value_mse_loss_layer_002": 0.005035, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.00017, "vq_loss_layer_019": 0.000136, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000401, "vq_loss_layer_028": 0.000599, "vq_loss_layer_029": 0.00071, "vq_loss_layer_030": 0.001465, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.310929, "epoch": 0.01194, "grad_norm": 0.0012563884956762195, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.055908, "key_mse_loss_layer_006": 0.062256, "key_mse_loss_layer_007": 0.071777, "key_mse_loss_layer_008": 0.07959, "key_mse_loss_layer_009": 0.083496, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.089844, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.07373, "key_mse_loss_layer_031": 0.059326, "kv_mse_loss": 0.050293, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050745, "step": 11940, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001404, "value_mse_loss_layer_002": 0.004883, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007416, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.015137, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015564, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000189, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.000965, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.003387 }, { "ce_loss": 2.290355, "epoch": 0.01195, "grad_norm": 0.0013801822206005454, "key_mse_loss_layer_000": 0.003769, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050851, "kv_vq_loss": 0.000445, "learning_rate": 0.001, "loss": 0.051343, "step": 11950, "value_mse_loss_layer_000": 0.000486, "value_mse_loss_layer_001": 0.001396, "value_mse_loss_layer_002": 0.004944, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007599, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.05957, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.057861, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000194, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.00023, "vq_loss_layer_019": 0.000217, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.001045, "vq_loss_layer_029": 0.001175, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.276318, "epoch": 0.01196, "grad_norm": 0.0012304128613322973, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050616, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.051077, "step": 11960, "value_mse_loss_layer_000": 0.000471, "value_mse_loss_layer_001": 0.001396, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007416, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.02478, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.000229, "vq_loss_layer_024": 0.000202, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000383, "vq_loss_layer_027": 0.000429, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.002716 }, { "ce_loss": 2.320925, "epoch": 0.01197, "grad_norm": 0.0018694541649892926, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050504, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050961, "step": 11970, "value_mse_loss_layer_000": 0.000481, "value_mse_loss_layer_001": 0.001404, "value_mse_loss_layer_002": 0.004944, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.015259, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.039551, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000427, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.00032, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000383, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000816, "vq_loss_layer_030": 0.001709, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.327661, "epoch": 0.01198, "grad_norm": 0.00117756228428334, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.011047, "key_mse_loss_layer_002": 0.059814, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.072754, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.106445, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.078613, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.12207, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.103027, "key_mse_loss_layer_017": 0.107422, "key_mse_loss_layer_018": 0.110352, "key_mse_loss_layer_019": 0.095215, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.101074, "key_mse_loss_layer_022": 0.102051, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.090332, "key_mse_loss_layer_027": 0.088867, "key_mse_loss_layer_028": 0.094727, "key_mse_loss_layer_029": 0.088867, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.050403, "kv_vq_loss": 0.000417, "learning_rate": 0.001, "loss": 0.050854, "step": 11980, "value_mse_loss_layer_000": 0.000463, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.005005, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007996, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010132, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.014282, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015564, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.054932, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.000172, "vq_loss_layer_008": 0.000202, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000217, "vq_loss_layer_011": 0.000244, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000376, "vq_loss_layer_017": 0.000376, "vq_loss_layer_018": 0.000241, "vq_loss_layer_019": 0.000192, "vq_loss_layer_020": 0.000299, "vq_loss_layer_021": 0.000401, "vq_loss_layer_022": 0.00033, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000381, "vq_loss_layer_025": 0.000526, "vq_loss_layer_026": 0.000706, "vq_loss_layer_027": 0.000847, "vq_loss_layer_028": 0.001167, "vq_loss_layer_029": 0.001503, "vq_loss_layer_030": 0.002975, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.286909, "epoch": 0.01199, "grad_norm": 0.001389785436913371, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050494, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.05097, "step": 11990, "value_mse_loss_layer_000": 0.000481, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.007629, "value_mse_loss_layer_005": 0.007324, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.013245, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.02478, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.05249, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000178, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000242, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.00016, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000347, "vq_loss_layer_022": 0.00022, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000239, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000385, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.249546, "epoch": 0.012, "grad_norm": 0.0014405957190319896, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050522, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.050989, "step": 12000, "value_mse_loss_layer_000": 0.000484, "value_mse_loss_layer_001": 0.001411, "value_mse_loss_layer_002": 0.005127, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007416, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013733, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.283156, "epoch": 0.01201, "grad_norm": 0.0014829003484919667, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050491, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.05097, "step": 12010, "value_mse_loss_layer_000": 0.000479, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007294, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013245, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.052979, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000345, "vq_loss_layer_022": 0.000241, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000205, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000401, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000801, "vq_loss_layer_030": 0.00177, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.344805, "epoch": 0.01202, "grad_norm": 0.0011593321105465293, "key_mse_loss_layer_000": 0.003876, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050211, "kv_vq_loss": 0.000407, "learning_rate": 0.001, "loss": 0.050665, "step": 12020, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001411, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.00885, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.00769, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.029419, "value_mse_loss_layer_024": 0.034912, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.050537, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.066406, "value_mse_loss_layer_031": 0.055908, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000226, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.00015, "vq_loss_layer_021": 0.000257, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000218, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.000275, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.001076, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.003571 }, { "ce_loss": 2.263076, "epoch": 0.01203, "grad_norm": 0.0014234906993806362, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050403, "kv_vq_loss": 0.000459, "learning_rate": 0.001, "loss": 0.050891, "step": 12030, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.001419, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.007782, "value_mse_loss_layer_005": 0.007477, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.009949, "value_mse_loss_layer_008": 0.012268, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013794, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.015137, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.057861, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.053955, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000213, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000484, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000896, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.325847, "epoch": 0.01204, "grad_norm": 0.001124901813454926, "key_mse_loss_layer_000": 0.002686, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.059326, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.09375, "key_mse_loss_layer_009": 0.100098, "key_mse_loss_layer_010": 0.114746, "key_mse_loss_layer_011": 0.109863, "key_mse_loss_layer_012": 0.083984, "key_mse_loss_layer_013": 0.154297, "key_mse_loss_layer_014": 0.149414, "key_mse_loss_layer_015": 0.134766, "key_mse_loss_layer_016": 0.131836, "key_mse_loss_layer_017": 0.130859, "key_mse_loss_layer_018": 0.137695, "key_mse_loss_layer_019": 0.10791, "key_mse_loss_layer_020": 0.125977, "key_mse_loss_layer_021": 0.116699, "key_mse_loss_layer_022": 0.124512, "key_mse_loss_layer_023": 0.122559, "key_mse_loss_layer_024": 0.096191, "key_mse_loss_layer_025": 0.088379, "key_mse_loss_layer_026": 0.106934, "key_mse_loss_layer_027": 0.099121, "key_mse_loss_layer_028": 0.111816, "key_mse_loss_layer_029": 0.093262, "key_mse_loss_layer_030": 0.109375, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050815, "kv_vq_loss": 0.000423, "learning_rate": 0.001, "loss": 0.051288, "step": 12040, "value_mse_loss_layer_000": 0.000452, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.005066, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.008545, "value_mse_loss_layer_005": 0.007568, "value_mse_loss_layer_006": 0.009583, "value_mse_loss_layer_007": 0.010254, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.016968, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.021362, "value_mse_loss_layer_023": 0.023926, "value_mse_loss_layer_024": 0.027588, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.028564, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.041748, "value_mse_loss_layer_029": 0.04834, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 7.9e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000188, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000292, "vq_loss_layer_010": 0.000239, "vq_loss_layer_011": 0.000232, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000422, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000216, "vq_loss_layer_020": 0.000256, "vq_loss_layer_021": 0.000401, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000343, "vq_loss_layer_025": 0.0005, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.001045, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.004089 }, { "ce_loss": 2.277953, "epoch": 0.01205, "grad_norm": 0.0013444314245134592, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050653, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.051099, "step": 12050, "value_mse_loss_layer_000": 0.000479, "value_mse_loss_layer_001": 0.001389, "value_mse_loss_layer_002": 0.004791, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.007111, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.013733, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.052979, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000364, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000341, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.00071, "vq_loss_layer_029": 0.000763, "vq_loss_layer_030": 0.00174, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.317426, "epoch": 0.01206, "grad_norm": 0.0012895260006189346, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.050385, "kv_vq_loss": 0.000415, "learning_rate": 0.001, "loss": 0.050845, "step": 12060, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.004639, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.007294, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.002609, "vq_loss_layer_031": 0.003036 }, { "ce_loss": 2.309271, "epoch": 0.01207, "grad_norm": 0.0015141504118219018, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.050363, "kv_vq_loss": 0.000428, "learning_rate": 0.001, "loss": 0.050842, "step": 12070, "value_mse_loss_layer_000": 0.000467, "value_mse_loss_layer_001": 0.001389, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.012329, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.015137, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.057861, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.054443, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000171, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000235, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.000219, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.000942, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.308609, "epoch": 0.01208, "grad_norm": 0.0011975112138316035, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.123047, "key_mse_loss_layer_015": 0.113281, "key_mse_loss_layer_016": 0.106934, "key_mse_loss_layer_017": 0.105469, "key_mse_loss_layer_018": 0.112793, "key_mse_loss_layer_019": 0.094727, "key_mse_loss_layer_020": 0.106445, "key_mse_loss_layer_021": 0.100586, "key_mse_loss_layer_022": 0.104004, "key_mse_loss_layer_023": 0.102539, "key_mse_loss_layer_024": 0.083008, "key_mse_loss_layer_025": 0.07959, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.09375, "key_mse_loss_layer_028": 0.099609, "key_mse_loss_layer_029": 0.091309, "key_mse_loss_layer_030": 0.09668, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.050452, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.050912, "step": 12080, "value_mse_loss_layer_000": 0.000481, "value_mse_loss_layer_001": 0.001404, "value_mse_loss_layer_002": 0.005341, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.00769, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000206, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000267, "vq_loss_layer_018": 0.000224, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000904, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.003555 }, { "ce_loss": 2.338056, "epoch": 0.01209, "grad_norm": 0.001221990562044084, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.060303, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.050699, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.051154, "step": 12090, "value_mse_loss_layer_000": 0.000483, "value_mse_loss_layer_001": 0.001389, "value_mse_loss_layer_002": 0.004791, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011963, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.016113, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000144, "vq_loss_layer_009": 0.000224, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000209, "vq_loss_layer_023": 0.000231, "vq_loss_layer_024": 0.000194, "vq_loss_layer_025": 0.00024, "vq_loss_layer_026": 0.000368, "vq_loss_layer_027": 0.000422, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.001648, "vq_loss_layer_031": 0.002548 }, { "ce_loss": 2.286945, "epoch": 0.0121, "grad_norm": 0.0013164477422833443, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.050607, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.051089, "step": 12100, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.001411, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.007263, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.022461, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.028809, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.058594, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.00021, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.003357 }, { "ce_loss": 2.293507, "epoch": 0.01211, "grad_norm": 0.0015922380844131112, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.082031, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050824, "kv_vq_loss": 0.000431, "learning_rate": 0.001, "loss": 0.051306, "step": 12110, "value_mse_loss_layer_000": 0.000471, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.007324, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.012268, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.052979, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000372, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.003113 }, { "ce_loss": 2.321157, "epoch": 0.01212, "grad_norm": 0.0012157672317698598, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.059326, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.095215, "key_mse_loss_layer_010": 0.108887, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.138672, "key_mse_loss_layer_014": 0.137695, "key_mse_loss_layer_015": 0.121094, "key_mse_loss_layer_016": 0.116211, "key_mse_loss_layer_017": 0.112793, "key_mse_loss_layer_018": 0.121094, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.110352, "key_mse_loss_layer_021": 0.103027, "key_mse_loss_layer_022": 0.110352, "key_mse_loss_layer_023": 0.109863, "key_mse_loss_layer_024": 0.087891, "key_mse_loss_layer_025": 0.081543, "key_mse_loss_layer_026": 0.097168, "key_mse_loss_layer_027": 0.098145, "key_mse_loss_layer_028": 0.102539, "key_mse_loss_layer_029": 0.092773, "key_mse_loss_layer_030": 0.100098, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050702, "kv_vq_loss": 0.00043, "learning_rate": 0.001, "loss": 0.051175, "step": 12120, "value_mse_loss_layer_000": 0.000488, "value_mse_loss_layer_001": 0.001411, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.00769, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000235, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.00045, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000395, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.00022, "vq_loss_layer_021": 0.000412, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000353, "vq_loss_layer_025": 0.000435, "vq_loss_layer_026": 0.000557, "vq_loss_layer_027": 0.000668, "vq_loss_layer_028": 0.001198, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.00415 }, { "ce_loss": 2.348336, "epoch": 0.01213, "grad_norm": 0.001238119788467884, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.109863, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.087891, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050467, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050925, "step": 12130, "value_mse_loss_layer_000": 0.000475, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.005188, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.00769, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000185, "vq_loss_layer_009": 0.000269, "vq_loss_layer_010": 0.000227, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000267, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000228, "vq_loss_layer_021": 0.000347, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000441, "vq_loss_layer_026": 0.000576, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001045, "vq_loss_layer_030": 0.002502, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.275211, "epoch": 0.01214, "grad_norm": 0.0012612859718501568, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.08252, "kv_mse_loss": 0.050278, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050742, "step": 12140, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004944, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.014832, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.00022, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.00037, "vq_loss_layer_018": 0.00022, "vq_loss_layer_019": 0.000213, "vq_loss_layer_020": 0.000238, "vq_loss_layer_021": 0.000401, "vq_loss_layer_022": 0.000334, "vq_loss_layer_023": 0.000385, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.000397, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000603, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.001411, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.004517 }, { "ce_loss": 2.340969, "epoch": 0.01215, "grad_norm": 0.0011889563174918294, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.0504, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.050867, "step": 12150, "value_mse_loss_layer_000": 0.000496, "value_mse_loss_layer_001": 0.001411, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007477, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013611, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.017212, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000205, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000483, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.00022, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000816, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.276431, "epoch": 0.01216, "grad_norm": 0.0012856748653575778, "key_mse_loss_layer_000": 0.002747, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.051758, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.062012, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.07959, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050412, "kv_vq_loss": 0.000413, "learning_rate": 0.001, "loss": 0.050873, "step": 12160, "value_mse_loss_layer_000": 0.000467, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004791, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013245, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.024536, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.1e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000141, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000357, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000219, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000444, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.001411, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.004547 }, { "ce_loss": 2.287909, "epoch": 0.01217, "grad_norm": 0.0013829225208610296, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050586, "kv_vq_loss": 0.000426, "learning_rate": 0.001, "loss": 0.051062, "step": 12170, "value_mse_loss_layer_000": 0.000484, "value_mse_loss_layer_001": 0.001389, "value_mse_loss_layer_002": 0.004913, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013733, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.028564, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.00014, "vq_loss_layer_020": 0.000159, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000874, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.346704, "epoch": 0.01218, "grad_norm": 0.0014021085808053613, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.098145, "key_mse_loss_layer_031": 0.09082, "kv_mse_loss": 0.050323, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.0508, "step": 12180, "value_mse_loss_layer_000": 0.000484, "value_mse_loss_layer_001": 0.001389, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.010132, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000179, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000357, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000192, "vq_loss_layer_020": 0.000267, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000301, "vq_loss_layer_025": 0.000351, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000656, "vq_loss_layer_028": 0.001099, "vq_loss_layer_029": 0.001526, "vq_loss_layer_030": 0.003281, "vq_loss_layer_031": 0.004944 }, { "ce_loss": 2.283903, "epoch": 0.01219, "grad_norm": 0.0012329664314165711, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050528, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.051004, "step": 12190, "value_mse_loss_layer_000": 0.000483, "value_mse_loss_layer_001": 0.001396, "value_mse_loss_layer_002": 0.004944, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.007721, "value_mse_loss_layer_006": 0.009521, "value_mse_loss_layer_007": 0.009949, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014954, "value_mse_loss_layer_012": 0.015259, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.05249, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000209, "vq_loss_layer_011": 0.000232, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000399, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.000372, "vq_loss_layer_026": 0.000538, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.000965, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003632 }, { "ce_loss": 2.281902, "epoch": 0.0122, "grad_norm": 0.0014776228927075863, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.050476, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.05094, "step": 12200, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001389, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007629, "value_mse_loss_layer_005": 0.007263, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000213, "vq_loss_layer_024": 0.000209, "vq_loss_layer_025": 0.00025, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.001099, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.003265 }, { "ce_loss": 2.240688, "epoch": 0.01221, "grad_norm": 0.0012827458558604121, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050607, "kv_vq_loss": 0.000416, "learning_rate": 0.001, "loss": 0.051068, "step": 12210, "value_mse_loss_layer_000": 0.000471, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004639, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000236, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.001503, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.290151, "epoch": 0.01222, "grad_norm": 0.0013817157596349716, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.042236, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.09082, "key_mse_loss_layer_009": 0.097656, "key_mse_loss_layer_010": 0.110352, "key_mse_loss_layer_011": 0.105957, "key_mse_loss_layer_012": 0.081055, "key_mse_loss_layer_013": 0.145508, "key_mse_loss_layer_014": 0.140625, "key_mse_loss_layer_015": 0.125, "key_mse_loss_layer_016": 0.119629, "key_mse_loss_layer_017": 0.119629, "key_mse_loss_layer_018": 0.124512, "key_mse_loss_layer_019": 0.097656, "key_mse_loss_layer_020": 0.112793, "key_mse_loss_layer_021": 0.105957, "key_mse_loss_layer_022": 0.11084, "key_mse_loss_layer_023": 0.108398, "key_mse_loss_layer_024": 0.083984, "key_mse_loss_layer_025": 0.078125, "key_mse_loss_layer_026": 0.09375, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.095703, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.093262, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.050449, "kv_vq_loss": 0.000417, "learning_rate": 0.001, "loss": 0.050912, "step": 12220, "value_mse_loss_layer_000": 0.000486, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004883, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009888, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.02417, "value_mse_loss_layer_024": 0.026489, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.028809, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.040527, "value_mse_loss_layer_029": 0.047607, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 7.5e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000201, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.000225, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000376, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000425, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000228, "vq_loss_layer_021": 0.000408, "vq_loss_layer_022": 0.000292, "vq_loss_layer_023": 0.000362, "vq_loss_layer_024": 0.000389, "vq_loss_layer_025": 0.000523, "vq_loss_layer_026": 0.000603, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.001305, "vq_loss_layer_029": 0.000801, "vq_loss_layer_030": 0.002289, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.280599, "epoch": 0.01223, "grad_norm": 0.001253768801689148, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050671, "kv_vq_loss": 0.000437, "learning_rate": 0.001, "loss": 0.051151, "step": 12230, "value_mse_loss_layer_000": 0.000481, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007263, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.050537, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000223, "vq_loss_layer_024": 0.000187, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000378, "vq_loss_layer_027": 0.000416, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.001526, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.324692, "epoch": 0.01224, "grad_norm": 0.0013683958677574992, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.05032, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.050781, "step": 12240, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000205, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000166, "vq_loss_layer_021": 0.000275, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000212, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000425, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.000813, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.324642, "epoch": 0.01225, "grad_norm": 0.0012995131546631455, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.061035, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.077637, "kv_mse_loss": 0.050751, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.051199, "step": 12250, "value_mse_loss_layer_000": 0.000479, "value_mse_loss_layer_001": 0.001389, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.00017, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000204, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000196, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000866, "vq_loss_layer_029": 0.00116, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.284201, "epoch": 0.01226, "grad_norm": 0.0014779495541006327, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.061768, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.051178, "kv_vq_loss": 0.000437, "learning_rate": 0.001, "loss": 0.051651, "step": 12260, "value_mse_loss_layer_000": 0.000481, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.000364, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000607, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.002533 }, { "ce_loss": 2.346307, "epoch": 0.01227, "grad_norm": 0.0012091590324416757, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.102051, "key_mse_loss_layer_017": 0.103027, "key_mse_loss_layer_018": 0.111816, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.0508, "kv_vq_loss": 0.000415, "learning_rate": 0.001, "loss": 0.051248, "step": 12270, "value_mse_loss_layer_000": 0.000486, "value_mse_loss_layer_001": 0.001411, "value_mse_loss_layer_002": 0.005127, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.007629, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.010559, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000192, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000204, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000381, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001068, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.00351 }, { "ce_loss": 2.267687, "epoch": 0.01228, "grad_norm": 0.0014379618223756552, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.063477, "key_mse_loss_layer_005": 0.066406, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.110352, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.050601, "kv_vq_loss": 0.000415, "learning_rate": 0.001, "loss": 0.051068, "step": 12280, "value_mse_loss_layer_000": 0.000458, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004791, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.01239, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.017944, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.052979, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000205, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000254, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.000166, "vq_loss_layer_021": 0.000236, "vq_loss_layer_022": 0.000168, "vq_loss_layer_023": 0.00022, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000228, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000425, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000755, "vq_loss_layer_030": 0.001732, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.307733, "epoch": 0.01229, "grad_norm": 0.001262262579984963, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.050403, "kv_vq_loss": 0.000436, "learning_rate": 0.001, "loss": 0.050891, "step": 12290, "value_mse_loss_layer_000": 0.000483, "value_mse_loss_layer_001": 0.001404, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008728, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007477, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.014709, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.025024, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000679, "vq_loss_layer_029": 0.000904, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.320574, "epoch": 0.0123, "grad_norm": 0.0012587375240400434, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050922, "kv_vq_loss": 0.000417, "learning_rate": 0.001, "loss": 0.051392, "step": 12300, "value_mse_loss_layer_000": 0.000469, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007568, "value_mse_loss_layer_006": 0.009766, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012512, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.015137, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.052979, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000132, "vq_loss_layer_007": 0.000182, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000381, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.00022, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000372, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000938, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.003693 }, { "ce_loss": 2.301226, "epoch": 0.01231, "grad_norm": 0.0012968761147931218, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.050955, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.051382, "step": 12310, "value_mse_loss_layer_000": 0.000475, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004944, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013611, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000227, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000395, "vq_loss_layer_027": 0.000475, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.001503, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.312214, "epoch": 0.01232, "grad_norm": 0.0017467897851020098, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.0625, "key_mse_loss_layer_007": 0.071289, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.074219, "key_mse_loss_layer_031": 0.057373, "kv_mse_loss": 0.050635, "kv_vq_loss": 0.000444, "learning_rate": 0.001, "loss": 0.051117, "step": 12320, "value_mse_loss_layer_000": 0.000481, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004974, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.008728, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.014404, "value_mse_loss_layer_012": 0.015747, "value_mse_loss_layer_013": 0.017822, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.020386, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.025269, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 6.9e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.00021, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000349, "vq_loss_layer_014": 0.000414, "vq_loss_layer_015": 0.000462, "vq_loss_layer_016": 0.000374, "vq_loss_layer_017": 0.000406, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000408, "vq_loss_layer_022": 0.00032, "vq_loss_layer_023": 0.000309, "vq_loss_layer_024": 0.00032, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.000534, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.001167, "vq_loss_layer_029": 0.001175, "vq_loss_layer_030": 0.004211, "vq_loss_layer_031": 0.004059 }, { "ce_loss": 2.290793, "epoch": 0.01233, "grad_norm": 0.0012701023370027542, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050595, "kv_vq_loss": 0.000432, "learning_rate": 0.001, "loss": 0.05108, "step": 12330, "value_mse_loss_layer_000": 0.000483, "value_mse_loss_layer_001": 0.001366, "value_mse_loss_layer_002": 0.0047, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.012512, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000336, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000166, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000224, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000209, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000385, "vq_loss_layer_027": 0.000393, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.000717, "vq_loss_layer_030": 0.001701, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.322761, "epoch": 0.01234, "grad_norm": 0.0011271403636783361, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.072754, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.075684, "key_mse_loss_layer_030": 0.076172, "key_mse_loss_layer_031": 0.062012, "kv_mse_loss": 0.0505, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.050958, "step": 12340, "value_mse_loss_layer_000": 0.000475, "value_mse_loss_layer_001": 0.001366, "value_mse_loss_layer_002": 0.0047, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.007141, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000227, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000433, "vq_loss_layer_028": 0.000668, "vq_loss_layer_029": 0.000809, "vq_loss_layer_030": 0.001549, "vq_loss_layer_031": 0.002884 }, { "ce_loss": 2.332482, "epoch": 0.01235, "grad_norm": 0.0012857880210503936, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050186, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.05062, "step": 12350, "value_mse_loss_layer_000": 0.000473, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.0047, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.015381, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017944, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.026978, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.057861, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000238, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000408, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.001945, "vq_loss_layer_031": 0.002914 }, { "ce_loss": 2.336141, "epoch": 0.01236, "grad_norm": 0.0013443098869174719, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.050067, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050543, "step": 12360, "value_mse_loss_layer_000": 0.000475, "value_mse_loss_layer_001": 0.00135, "value_mse_loss_layer_002": 0.004669, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000427, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.003113 }, { "ce_loss": 2.343711, "epoch": 0.01237, "grad_norm": 0.001398542895913124, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.058838, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.050043, "kv_vq_loss": 0.000429, "learning_rate": 0.001, "loss": 0.050522, "step": 12370, "value_mse_loss_layer_000": 0.000483, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.004791, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013733, "value_mse_loss_layer_011": 0.014404, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.020386, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000238, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000408, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000645, "vq_loss_layer_029": 0.000969, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.352068, "epoch": 0.01238, "grad_norm": 0.001047872588969767, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.075684, "key_mse_loss_layer_031": 0.060791, "kv_mse_loss": 0.050381, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.050845, "step": 12380, "value_mse_loss_layer_000": 0.000486, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000193, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000351, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.003403 }, { "ce_loss": 2.374152, "epoch": 0.01239, "grad_norm": 0.0013884082436561584, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.061523, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.094727, "key_mse_loss_layer_009": 0.100098, "key_mse_loss_layer_010": 0.112305, "key_mse_loss_layer_011": 0.108887, "key_mse_loss_layer_012": 0.08252, "key_mse_loss_layer_013": 0.140625, "key_mse_loss_layer_014": 0.134766, "key_mse_loss_layer_015": 0.123047, "key_mse_loss_layer_016": 0.120605, "key_mse_loss_layer_017": 0.115234, "key_mse_loss_layer_018": 0.123535, "key_mse_loss_layer_019": 0.099609, "key_mse_loss_layer_020": 0.11377, "key_mse_loss_layer_021": 0.105957, "key_mse_loss_layer_022": 0.114746, "key_mse_loss_layer_023": 0.115234, "key_mse_loss_layer_024": 0.094727, "key_mse_loss_layer_025": 0.085938, "key_mse_loss_layer_026": 0.103516, "key_mse_loss_layer_027": 0.101074, "key_mse_loss_layer_028": 0.109375, "key_mse_loss_layer_029": 0.09668, "key_mse_loss_layer_030": 0.11084, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.050568, "kv_vq_loss": 0.000413, "learning_rate": 0.001, "loss": 0.051016, "step": 12390, "value_mse_loss_layer_000": 0.00041, "value_mse_loss_layer_001": 0.001289, "value_mse_loss_layer_002": 0.004791, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007416, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.01416, "value_mse_loss_layer_017": 0.016968, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.02124, "value_mse_loss_layer_022": 0.021484, "value_mse_loss_layer_023": 0.024902, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.053711, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000219, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000261, "vq_loss_layer_011": 0.00022, "vq_loss_layer_012": 0.000362, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.000389, "vq_loss_layer_025": 0.000511, "vq_loss_layer_026": 0.000614, "vq_loss_layer_027": 0.000748, "vq_loss_layer_028": 0.001251, "vq_loss_layer_029": 0.001335, "vq_loss_layer_030": 0.003128, "vq_loss_layer_031": 0.004639 }, { "ce_loss": 2.287297, "epoch": 0.0124, "grad_norm": 0.001135534606873989, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050296, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050754, "step": 12400, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004791, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007111, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011963, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.001617, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.313924, "epoch": 0.01241, "grad_norm": 0.0014483809936791658, "key_mse_loss_layer_000": 0.002731, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050009, "kv_vq_loss": 0.000413, "learning_rate": 0.001, "loss": 0.050467, "step": 12410, "value_mse_loss_layer_000": 0.000463, "value_mse_loss_layer_001": 0.001343, "value_mse_loss_layer_002": 0.004639, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007446, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000195, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000383, "vq_loss_layer_027": 0.000412, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.002884 }, { "ce_loss": 2.313278, "epoch": 0.01242, "grad_norm": 0.001008406514301896, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.126953, "key_mse_loss_layer_015": 0.115234, "key_mse_loss_layer_016": 0.109375, "key_mse_loss_layer_017": 0.108887, "key_mse_loss_layer_018": 0.115723, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.106934, "key_mse_loss_layer_021": 0.100586, "key_mse_loss_layer_022": 0.10498, "key_mse_loss_layer_023": 0.102051, "key_mse_loss_layer_024": 0.081543, "key_mse_loss_layer_025": 0.07666, "key_mse_loss_layer_026": 0.089844, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.095215, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050537, "kv_vq_loss": 0.000421, "learning_rate": 0.001, "loss": 0.051016, "step": 12420, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001366, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007416, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009888, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000324, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.000755, "vq_loss_layer_030": 0.001701, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.328427, "epoch": 0.01243, "grad_norm": 0.0013617543736472726, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.061523, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.050076, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050522, "step": 12430, "value_mse_loss_layer_000": 0.000484, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.00766, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011963, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.020752, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000209, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000626, "vq_loss_layer_029": 0.00082, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.002533 }, { "ce_loss": 2.282042, "epoch": 0.01244, "grad_norm": 0.0017308946698904037, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.05249, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.0625, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.050702, "kv_vq_loss": 0.000435, "learning_rate": 0.001, "loss": 0.051187, "step": 12440, "value_mse_loss_layer_000": 0.000483, "value_mse_loss_layer_001": 0.001366, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.054932, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000238, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000359, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000158, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000389, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.00037, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000935, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.333194, "epoch": 0.01245, "grad_norm": 0.0011092664208263159, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050314, "kv_vq_loss": 0.000432, "learning_rate": 0.001, "loss": 0.050793, "step": 12450, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007294, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011963, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000385, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000383, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000633, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.001701, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.299854, "epoch": 0.01246, "grad_norm": 0.0012570723192766309, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.059326, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.05051, "kv_vq_loss": 0.000417, "learning_rate": 0.001, "loss": 0.050986, "step": 12460, "value_mse_loss_layer_000": 0.000471, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004974, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.050537, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.000957, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.336454, "epoch": 0.01247, "grad_norm": 0.001192705356515944, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050119, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050565, "step": 12470, "value_mse_loss_layer_000": 0.000471, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004791, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007385, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015564, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000224, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000238, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.001625, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.319343, "epoch": 0.01248, "grad_norm": 0.0016060199122875929, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.050418, "kv_vq_loss": 0.000422, "learning_rate": 0.001, "loss": 0.050879, "step": 12480, "value_mse_loss_layer_000": 0.000483, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.004883, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007721, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009888, "value_mse_loss_layer_008": 0.012573, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.056641, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000229, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000211, "vq_loss_layer_019": 0.000198, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000362, "vq_loss_layer_022": 0.000238, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.003403, "vq_loss_layer_031": 0.004303 }, { "ce_loss": 2.289281, "epoch": 0.01249, "grad_norm": 0.001084761111997068, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.050131, "kv_vq_loss": 0.000414, "learning_rate": 0.001, "loss": 0.050583, "step": 12490, "value_mse_loss_layer_000": 0.000473, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004669, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011963, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015564, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000332, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.00017, "vq_loss_layer_019": 0.00014, "vq_loss_layer_020": 0.000153, "vq_loss_layer_021": 0.000263, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000208, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.00177, "vq_loss_layer_031": 0.002884 }, { "ce_loss": 2.272439, "epoch": 0.0125, "grad_norm": 0.0012615774758160114, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050159, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050583, "step": 12500, "value_mse_loss_layer_000": 0.000481, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.0047, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013611, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.022705, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.05835, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000236, "vq_loss_layer_026": 0.000374, "vq_loss_layer_027": 0.000433, "vq_loss_layer_028": 0.000614, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.001503, "vq_loss_layer_031": 0.002625 }, { "ce_loss": 2.331816, "epoch": 0.01251, "grad_norm": 0.001288911560550332, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.059082, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.100098, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.089355, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.094238, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.050558, "kv_vq_loss": 0.000424, "learning_rate": 0.001, "loss": 0.051044, "step": 12510, "value_mse_loss_layer_000": 0.000467, "value_mse_loss_layer_001": 0.001366, "value_mse_loss_layer_002": 0.005219, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014038, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.025146, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.054199, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000238, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000261, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000319, "vq_loss_layer_025": 0.000399, "vq_loss_layer_026": 0.000599, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000977, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.349847, "epoch": 0.01252, "grad_norm": 0.001137292361818254, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.07373, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.05007, "kv_vq_loss": 0.000407, "learning_rate": 0.001, "loss": 0.050516, "step": 12520, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.001343, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.007111, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.050537, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000225, "vq_loss_layer_024": 0.000202, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000395, "vq_loss_layer_027": 0.000402, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000717, "vq_loss_layer_030": 0.001465, "vq_loss_layer_031": 0.002716 }, { "ce_loss": 2.339528, "epoch": 0.01253, "grad_norm": 0.0014354283921420574, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050159, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050595, "step": 12530, "value_mse_loss_layer_000": 0.000473, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004791, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007538, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.024536, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.050781, "value_mse_loss_layer_029": 0.05957, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.054688, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001099, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.003265 }, { "ce_loss": 2.336632, "epoch": 0.01254, "grad_norm": 0.0013623797567561269, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050146, "kv_vq_loss": 0.000413, "learning_rate": 0.001, "loss": 0.050629, "step": 12540, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.00135, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000189, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000385, "vq_loss_layer_017": 0.000425, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000188, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000904, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.003326 }, { "ce_loss": 2.30869, "epoch": 0.01255, "grad_norm": 0.0013545744586735964, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.05054, "kv_vq_loss": 0.000433, "learning_rate": 0.001, "loss": 0.051019, "step": 12550, "value_mse_loss_layer_000": 0.000463, "value_mse_loss_layer_001": 0.00135, "value_mse_loss_layer_002": 0.004639, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011963, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013733, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.05249, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.003159 }, { "ce_loss": 2.298838, "epoch": 0.01256, "grad_norm": 0.0011771919671446085, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050406, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050851, "step": 12560, "value_mse_loss_layer_000": 0.000467, "value_mse_loss_layer_001": 0.001343, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.007324, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000355, "vq_loss_layer_018": 0.000164, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000209, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000201, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000374, "vq_loss_layer_027": 0.000408, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000767, "vq_loss_layer_030": 0.001701, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.293958, "epoch": 0.01257, "grad_norm": 0.0014860733645036817, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050601, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.051056, "step": 12570, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001366, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000218, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.002808 }, { "ce_loss": 2.290451, "epoch": 0.01258, "grad_norm": 0.0013519355561584234, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.076172, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.050354, "kv_vq_loss": 0.000416, "learning_rate": 0.001, "loss": 0.050812, "step": 12580, "value_mse_loss_layer_000": 0.000469, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007477, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009888, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.020264, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.026123, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.054199, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000173, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000965, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.003082 }, { "ce_loss": 2.206866, "epoch": 0.01259, "grad_norm": 0.0013698944821953773, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.060059, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.077637, "kv_mse_loss": 0.05065, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.051108, "step": 12590, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004608, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.00036, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000201, "vq_loss_layer_023": 0.000225, "vq_loss_layer_024": 0.000194, "vq_loss_layer_025": 0.000232, "vq_loss_layer_026": 0.000349, "vq_loss_layer_027": 0.000395, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.00079, "vq_loss_layer_030": 0.001587, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.298021, "epoch": 0.0126, "grad_norm": 0.0013060561614111066, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.050488, "kv_vq_loss": 0.000414, "learning_rate": 0.001, "loss": 0.050958, "step": 12600, "value_mse_loss_layer_000": 0.000462, "value_mse_loss_layer_001": 0.001343, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.00021, "vq_loss_layer_025": 0.00025, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000904, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.003113 }, { "ce_loss": 2.264839, "epoch": 0.01261, "grad_norm": 0.0012884289026260376, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.054199, "key_mse_loss_layer_004": 0.060547, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.050519, "kv_vq_loss": 0.000427, "learning_rate": 0.001, "loss": 0.050992, "step": 12610, "value_mse_loss_layer_000": 0.000481, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.0047, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.007111, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013245, "value_mse_loss_layer_011": 0.014648, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.00024, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.002762 }, { "ce_loss": 2.32129, "epoch": 0.01262, "grad_norm": 0.0015874445671215653, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050461, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050916, "step": 12620, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004883, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.007294, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.053955, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.320398, "epoch": 0.01263, "grad_norm": 0.0010786002967506647, "key_mse_loss_layer_000": 0.002487, "key_mse_loss_layer_001": 0.009521, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.043457, "key_mse_loss_layer_004": 0.040039, "key_mse_loss_layer_005": 0.054688, "key_mse_loss_layer_006": 0.061523, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.131836, "key_mse_loss_layer_014": 0.12793, "key_mse_loss_layer_015": 0.114258, "key_mse_loss_layer_016": 0.105957, "key_mse_loss_layer_017": 0.105469, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.055908, "kv_mse_loss": 0.050314, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050766, "step": 12630, "value_mse_loss_layer_000": 0.000444, "value_mse_loss_layer_001": 0.001297, "value_mse_loss_layer_002": 0.0047, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.014893, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.027466, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.029419, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.048828, "value_mse_loss_layer_030": 0.05127, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000242, "vq_loss_layer_010": 0.000228, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.00036, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000416, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000372, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000181, "vq_loss_layer_020": 0.000263, "vq_loss_layer_021": 0.000429, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000311, "vq_loss_layer_025": 0.000378, "vq_loss_layer_026": 0.000456, "vq_loss_layer_027": 0.000484, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.001671, "vq_loss_layer_031": 0.003677 }, { "ce_loss": 2.306457, "epoch": 0.01264, "grad_norm": 0.001288877916522324, "key_mse_loss_layer_000": 0.003937, "key_mse_loss_layer_001": 0.011169, "key_mse_loss_layer_002": 0.060059, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.106934, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050406, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.050858, "step": 12640, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.004883, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.035156, "value_mse_loss_layer_027": 0.045898, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.060303, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.056641, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000136, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000286, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000368, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000211, "vq_loss_layer_019": 0.000192, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000345, "vq_loss_layer_025": 0.000381, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000607, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.001228, "vq_loss_layer_030": 0.002869, "vq_loss_layer_031": 0.004211 }, { "ce_loss": 2.371529, "epoch": 0.01265, "grad_norm": 0.001341201364994049, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.050262, "kv_vq_loss": 0.000423, "learning_rate": 0.001, "loss": 0.050726, "step": 12650, "value_mse_loss_layer_000": 0.000458, "value_mse_loss_layer_001": 0.001335, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007324, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009888, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000198, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.000401, "vq_loss_layer_017": 0.000391, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000343, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000288, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000538, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000824, "vq_loss_layer_029": 0.000896, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.302835, "epoch": 0.01266, "grad_norm": 0.0010810464154928923, "key_mse_loss_layer_000": 0.003479, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050089, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050534, "step": 12660, "value_mse_loss_layer_000": 0.000481, "value_mse_loss_layer_001": 0.001366, "value_mse_loss_layer_002": 0.005005, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.058105, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.05249, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000168, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000218, "vq_loss_layer_024": 0.000219, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.000904, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.34607, "epoch": 0.01267, "grad_norm": 0.0017221334856003523, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.108398, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.123047, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.10498, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.105469, "key_mse_loss_layer_021": 0.099609, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.081055, "key_mse_loss_layer_025": 0.077148, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.091797, "key_mse_loss_layer_028": 0.097656, "key_mse_loss_layer_029": 0.092773, "key_mse_loss_layer_030": 0.099121, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.050253, "kv_vq_loss": 0.000421, "learning_rate": 0.001, "loss": 0.050708, "step": 12670, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001366, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013733, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.056885, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000171, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000208, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.00032, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000313, "vq_loss_layer_024": 0.000378, "vq_loss_layer_025": 0.000423, "vq_loss_layer_026": 0.000584, "vq_loss_layer_027": 0.000652, "vq_loss_layer_028": 0.000942, "vq_loss_layer_029": 0.001228, "vq_loss_layer_030": 0.002792, "vq_loss_layer_031": 0.004547 }, { "ce_loss": 2.31617, "epoch": 0.01268, "grad_norm": 0.0011196020059287548, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.102539, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050449, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.0509, "step": 12680, "value_mse_loss_layer_000": 0.00045, "value_mse_loss_layer_001": 0.001343, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.007629, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.010193, "value_mse_loss_layer_008": 0.012146, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.020874, "value_mse_loss_layer_020": 0.02356, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.030029, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.039551, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.057373, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.054688, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000477, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.00021, "vq_loss_layer_019": 0.000211, "vq_loss_layer_020": 0.000237, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000232, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000425, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.001511, "vq_loss_layer_031": 0.00354 }, { "ce_loss": 2.288915, "epoch": 0.01269, "grad_norm": 0.0013546602567657828, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050284, "kv_vq_loss": 0.000425, "learning_rate": 0.001, "loss": 0.050751, "step": 12690, "value_mse_loss_layer_000": 0.000458, "value_mse_loss_layer_001": 0.001335, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.007782, "value_mse_loss_layer_005": 0.007416, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000862, "vq_loss_layer_029": 0.000942, "vq_loss_layer_030": 0.002823, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.305844, "epoch": 0.0127, "grad_norm": 0.001469266484491527, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049976, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050436, "step": 12700, "value_mse_loss_layer_000": 0.000473, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000219, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.001869, "vq_loss_layer_031": 0.003052 }, { "ce_loss": 2.277109, "epoch": 0.01271, "grad_norm": 0.0013493308797478676, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.050348, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.050812, "step": 12710, "value_mse_loss_layer_000": 0.000471, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.007477, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000235, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000957, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.003403 }, { "ce_loss": 2.325546, "epoch": 0.01272, "grad_norm": 0.0011393999448046088, "key_mse_loss_layer_000": 0.002792, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050296, "kv_vq_loss": 0.000417, "learning_rate": 0.001, "loss": 0.050754, "step": 12720, "value_mse_loss_layer_000": 0.000458, "value_mse_loss_layer_001": 0.00132, "value_mse_loss_layer_002": 0.004608, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013428, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000374, "vq_loss_layer_027": 0.000458, "vq_loss_layer_028": 0.00061, "vq_loss_layer_029": 0.000729, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.00267 }, { "ce_loss": 2.289875, "epoch": 0.01273, "grad_norm": 0.001318779424764216, "key_mse_loss_layer_000": 0.003708, "key_mse_loss_layer_001": 0.011108, "key_mse_loss_layer_002": 0.059082, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.091309, "key_mse_loss_layer_028": 0.094727, "key_mse_loss_layer_029": 0.091797, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.05062, "kv_vq_loss": 0.000422, "learning_rate": 0.001, "loss": 0.051096, "step": 12730, "value_mse_loss_layer_000": 0.000483, "value_mse_loss_layer_001": 0.001411, "value_mse_loss_layer_002": 0.004944, "value_mse_loss_layer_003": 0.008911, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.046631, "value_mse_loss_layer_028": 0.052734, "value_mse_loss_layer_029": 0.062256, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.059814, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000286, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000185, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000383, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000607, "vq_loss_layer_028": 0.001259, "vq_loss_layer_029": 0.001488, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.005157 }, { "ce_loss": 2.382187, "epoch": 0.01274, "grad_norm": 0.0012153194984421134, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.050357, "kv_vq_loss": 0.000415, "learning_rate": 0.001, "loss": 0.050821, "step": 12740, "value_mse_loss_layer_000": 0.000462, "value_mse_loss_layer_001": 0.001343, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.007263, "value_mse_loss_layer_006": 0.00946, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.015137, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.020508, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000479, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.000357, "vq_loss_layer_027": 0.000439, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000767, "vq_loss_layer_030": 0.001663, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.293247, "epoch": 0.01275, "grad_norm": 0.0013738584239035845, "key_mse_loss_layer_000": 0.002777, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.092773, "key_mse_loss_layer_009": 0.098633, "key_mse_loss_layer_010": 0.112305, "key_mse_loss_layer_011": 0.10791, "key_mse_loss_layer_012": 0.080566, "key_mse_loss_layer_013": 0.149414, "key_mse_loss_layer_014": 0.144531, "key_mse_loss_layer_015": 0.12793, "key_mse_loss_layer_016": 0.12793, "key_mse_loss_layer_017": 0.125, "key_mse_loss_layer_018": 0.133789, "key_mse_loss_layer_019": 0.102539, "key_mse_loss_layer_020": 0.118652, "key_mse_loss_layer_021": 0.113281, "key_mse_loss_layer_022": 0.121094, "key_mse_loss_layer_023": 0.117188, "key_mse_loss_layer_024": 0.093262, "key_mse_loss_layer_025": 0.084961, "key_mse_loss_layer_026": 0.103027, "key_mse_loss_layer_027": 0.097656, "key_mse_loss_layer_028": 0.105469, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.106934, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050723, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.051184, "step": 12750, "value_mse_loss_layer_000": 0.000456, "value_mse_loss_layer_001": 0.001335, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007141, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013733, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015442, "value_mse_loss_layer_015": 0.016357, "value_mse_loss_layer_016": 0.013245, "value_mse_loss_layer_017": 0.01709, "value_mse_loss_layer_018": 0.014709, "value_mse_loss_layer_019": 0.017578, "value_mse_loss_layer_020": 0.019165, "value_mse_loss_layer_021": 0.020508, "value_mse_loss_layer_022": 0.020752, "value_mse_loss_layer_023": 0.022217, "value_mse_loss_layer_024": 0.025269, "value_mse_loss_layer_025": 0.031738, "value_mse_loss_layer_026": 0.026733, "value_mse_loss_layer_027": 0.034668, "value_mse_loss_layer_028": 0.039307, "value_mse_loss_layer_029": 0.04541, "value_mse_loss_layer_030": 0.05249, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000187, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.000214, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.000326, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000187, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000311, "vq_loss_layer_025": 0.000515, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.000828, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.003067, "vq_loss_layer_031": 0.00386 }, { "ce_loss": 2.32237, "epoch": 0.01276, "grad_norm": 0.0014954841462895274, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050287, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050732, "step": 12760, "value_mse_loss_layer_000": 0.000473, "value_mse_loss_layer_001": 0.00135, "value_mse_loss_layer_002": 0.004639, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.007324, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.025635, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.028564, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.058105, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000209, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000938, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.002747 }, { "ce_loss": 2.313191, "epoch": 0.01277, "grad_norm": 0.001528109540231526, "key_mse_loss_layer_000": 0.00351, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.055908, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.071289, "key_mse_loss_layer_008": 0.07959, "key_mse_loss_layer_009": 0.081543, "key_mse_loss_layer_010": 0.093262, "key_mse_loss_layer_011": 0.092773, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.104492, "key_mse_loss_layer_014": 0.102539, "key_mse_loss_layer_015": 0.091797, "key_mse_loss_layer_016": 0.083984, "key_mse_loss_layer_017": 0.086914, "key_mse_loss_layer_018": 0.094238, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.085449, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050452, "kv_vq_loss": 0.000423, "learning_rate": 0.001, "loss": 0.050912, "step": 12770, "value_mse_loss_layer_000": 0.000467, "value_mse_loss_layer_001": 0.001366, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007111, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.060303, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.059326, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.000138, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000212, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000277, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000267, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000158, "vq_loss_layer_021": 0.000282, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.000204, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.001175, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.283931, "epoch": 0.01278, "grad_norm": 0.0011027447180822492, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050351, "kv_vq_loss": 0.000425, "learning_rate": 0.001, "loss": 0.050818, "step": 12780, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.00135, "value_mse_loss_layer_002": 0.004669, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007263, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.012207, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.053955, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000187, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000233, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000349, "vq_loss_layer_025": 0.000355, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000641, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.001472, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.003113 }, { "ce_loss": 2.325951, "epoch": 0.01279, "grad_norm": 0.0011343329679220915, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.078613, "kv_mse_loss": 0.050128, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050552, "step": 12790, "value_mse_loss_layer_000": 0.000473, "value_mse_loss_layer_001": 0.001343, "value_mse_loss_layer_002": 0.004791, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.007263, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.015991, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.024536, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000171, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.000399, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.00023, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000326, "vq_loss_layer_025": 0.000374, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.001091, "vq_loss_layer_029": 0.001648, "vq_loss_layer_030": 0.002869, "vq_loss_layer_031": 0.005066 }, { "ce_loss": 2.277114, "epoch": 0.0128, "grad_norm": 0.0015050198417156935, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.05025, "kv_vq_loss": 0.000417, "learning_rate": 0.001, "loss": 0.050711, "step": 12800, "value_mse_loss_layer_000": 0.000479, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.014343, "value_mse_loss_layer_012": 0.015137, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.020386, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.000381, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000207, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.00079, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.325235, "epoch": 0.01281, "grad_norm": 0.0010823418851941824, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.059082, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.096191, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.104004, "key_mse_loss_layer_012": 0.07959, "key_mse_loss_layer_013": 0.134766, "key_mse_loss_layer_014": 0.131836, "key_mse_loss_layer_015": 0.117188, "key_mse_loss_layer_016": 0.111816, "key_mse_loss_layer_017": 0.111328, "key_mse_loss_layer_018": 0.117676, "key_mse_loss_layer_019": 0.094238, "key_mse_loss_layer_020": 0.108398, "key_mse_loss_layer_021": 0.101562, "key_mse_loss_layer_022": 0.105957, "key_mse_loss_layer_023": 0.102539, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.077148, "key_mse_loss_layer_026": 0.091309, "key_mse_loss_layer_027": 0.087891, "key_mse_loss_layer_028": 0.09668, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.050134, "kv_vq_loss": 0.000421, "learning_rate": 0.001, "loss": 0.050586, "step": 12810, "value_mse_loss_layer_000": 0.000467, "value_mse_loss_layer_001": 0.00135, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007416, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.048828, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000224, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000422, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000235, "vq_loss_layer_019": 0.000202, "vq_loss_layer_020": 0.000241, "vq_loss_layer_021": 0.00037, "vq_loss_layer_022": 0.000366, "vq_loss_layer_023": 0.000334, "vq_loss_layer_024": 0.000338, "vq_loss_layer_025": 0.000526, "vq_loss_layer_026": 0.000584, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.000927, "vq_loss_layer_029": 0.000866, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.003937 }, { "ce_loss": 2.275127, "epoch": 0.01282, "grad_norm": 0.0012632922735065222, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050732, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.05116, "step": 12820, "value_mse_loss_layer_000": 0.000469, "value_mse_loss_layer_001": 0.001328, "value_mse_loss_layer_002": 0.004669, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.007111, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.022705, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000211, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.000351, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.003052 }, { "ce_loss": 2.305898, "epoch": 0.01283, "grad_norm": 0.0013607617001980543, "key_mse_loss_layer_000": 0.004669, "key_mse_loss_layer_001": 0.014343, "key_mse_loss_layer_002": 0.062988, "key_mse_loss_layer_003": 0.054443, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.068848, "key_mse_loss_layer_006": 0.08252, "key_mse_loss_layer_007": 0.080566, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.108398, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.102051, "key_mse_loss_layer_017": 0.106445, "key_mse_loss_layer_018": 0.112793, "key_mse_loss_layer_019": 0.098633, "key_mse_loss_layer_020": 0.109375, "key_mse_loss_layer_021": 0.104004, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.081543, "key_mse_loss_layer_025": 0.080078, "key_mse_loss_layer_026": 0.097656, "key_mse_loss_layer_027": 0.098145, "key_mse_loss_layer_028": 0.102539, "key_mse_loss_layer_029": 0.099609, "key_mse_loss_layer_030": 0.108398, "key_mse_loss_layer_031": 0.099121, "kv_mse_loss": 0.050897, "kv_vq_loss": 0.000431, "learning_rate": 0.001, "loss": 0.051358, "step": 12830, "value_mse_loss_layer_000": 0.000471, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.005005, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.008484, "value_mse_loss_layer_005": 0.008057, "value_mse_loss_layer_006": 0.009644, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.046875, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.060791, "value_mse_loss_layer_030": 0.071777, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 6.7e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000138, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000183, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000176, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000324, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000156, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000162, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000218, "vq_loss_layer_024": 0.000224, "vq_loss_layer_025": 0.00074, "vq_loss_layer_026": 0.001045, "vq_loss_layer_027": 0.000717, "vq_loss_layer_028": 0.001137, "vq_loss_layer_029": 0.001518, "vq_loss_layer_030": 0.002914, "vq_loss_layer_031": 0.005829 }, { "ce_loss": 2.315573, "epoch": 0.01284, "grad_norm": 0.0011177418055012822, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050262, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050687, "step": 12840, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.001328, "value_mse_loss_layer_002": 0.0047, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000219, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000809, "vq_loss_layer_030": 0.001694, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.276023, "epoch": 0.01285, "grad_norm": 0.0012258257484063506, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.050491, "kv_vq_loss": 0.000424, "learning_rate": 0.001, "loss": 0.050955, "step": 12850, "value_mse_loss_layer_000": 0.000462, "value_mse_loss_layer_001": 0.001328, "value_mse_loss_layer_002": 0.004608, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.007385, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.016479, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000406, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000418, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.003052 }, { "ce_loss": 2.290863, "epoch": 0.01286, "grad_norm": 0.001311817904934287, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050549, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.051001, "step": 12860, "value_mse_loss_layer_000": 0.000483, "value_mse_loss_layer_001": 0.001366, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008545, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007385, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.057861, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.00028, "vq_loss_layer_023": 0.000225, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000393, "vq_loss_layer_026": 0.000553, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.001305, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.003235 }, { "ce_loss": 2.315481, "epoch": 0.01287, "grad_norm": 0.001027748454362154, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.050531, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.05097, "step": 12870, "value_mse_loss_layer_000": 0.000467, "value_mse_loss_layer_001": 0.00132, "value_mse_loss_layer_002": 0.004639, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.00022, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.001831, "vq_loss_layer_031": 0.003128 }, { "ce_loss": 2.305536, "epoch": 0.01288, "grad_norm": 0.0014535743976011872, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.05076, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.051218, "step": 12880, "value_mse_loss_layer_000": 0.000469, "value_mse_loss_layer_001": 0.001335, "value_mse_loss_layer_002": 0.004791, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007416, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009949, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016724, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000218, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000219, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000319, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.00037, "vq_loss_layer_026": 0.000441, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.003159 }, { "ce_loss": 2.298889, "epoch": 0.01289, "grad_norm": 0.0011716425651684403, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.050586, "kv_vq_loss": 0.000417, "learning_rate": 0.001, "loss": 0.051047, "step": 12890, "value_mse_loss_layer_000": 0.000475, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004669, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.000942, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.003448 }, { "ce_loss": 2.238183, "epoch": 0.0129, "grad_norm": 0.0013479233020916581, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.011047, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.050858, "kv_vq_loss": 0.000416, "learning_rate": 0.001, "loss": 0.051297, "step": 12900, "value_mse_loss_layer_000": 0.000467, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.021118, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.050781, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.054688, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000182, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.000173, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.00033, "vq_loss_layer_016": 0.00029, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000259, "vq_loss_layer_022": 0.000191, "vq_loss_layer_023": 0.000227, "vq_loss_layer_024": 0.000236, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000473, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000942, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.003632 }, { "ce_loss": 2.303512, "epoch": 0.01291, "grad_norm": 0.001309962011873722, "key_mse_loss_layer_000": 0.003998, "key_mse_loss_layer_001": 0.011353, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.089844, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.090332, "key_mse_loss_layer_030": 0.098633, "key_mse_loss_layer_031": 0.09082, "kv_mse_loss": 0.050452, "kv_vq_loss": 0.000407, "learning_rate": 0.001, "loss": 0.0509, "step": 12910, "value_mse_loss_layer_000": 0.000467, "value_mse_loss_layer_001": 0.001396, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.014282, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013123, "value_mse_loss_layer_013": 0.014221, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.016479, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.016602, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.058105, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.05542, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000136, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000275, "vq_loss_layer_013": 0.0002, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000322, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000214, "vq_loss_layer_018": 0.000162, "vq_loss_layer_019": 0.000182, "vq_loss_layer_020": 0.000153, "vq_loss_layer_021": 0.000198, "vq_loss_layer_022": 0.000163, "vq_loss_layer_023": 0.000158, "vq_loss_layer_024": 0.000239, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000565, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.003082, "vq_loss_layer_031": 0.003754 }, { "ce_loss": 2.297653, "epoch": 0.01292, "grad_norm": 0.0012423719745129347, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.050345, "kv_vq_loss": 0.000427, "learning_rate": 0.001, "loss": 0.050815, "step": 12920, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.001328, "value_mse_loss_layer_002": 0.004608, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007141, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000441, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000483, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.000725, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.001633, "vq_loss_layer_031": 0.003326 }, { "ce_loss": 2.339921, "epoch": 0.01293, "grad_norm": 0.0011316979071125388, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050253, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050705, "step": 12930, "value_mse_loss_layer_000": 0.000469, "value_mse_loss_layer_001": 0.001335, "value_mse_loss_layer_002": 0.004669, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011963, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.026245, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000219, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000473, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.001648, "vq_loss_layer_031": 0.002548 }, { "ce_loss": 2.316669, "epoch": 0.01294, "grad_norm": 0.0011930344626307487, "key_mse_loss_layer_000": 0.002701, "key_mse_loss_layer_001": 0.009705, "key_mse_loss_layer_002": 0.05249, "key_mse_loss_layer_003": 0.043945, "key_mse_loss_layer_004": 0.041992, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.124512, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.072266, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.050415, "kv_vq_loss": 0.000426, "learning_rate": 0.001, "loss": 0.050882, "step": 12940, "value_mse_loss_layer_000": 0.000443, "value_mse_loss_layer_001": 0.001274, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013245, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014038, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.021484, "value_mse_loss_layer_023": 0.024048, "value_mse_loss_layer_024": 0.0271, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.026978, "value_mse_loss_layer_027": 0.034668, "value_mse_loss_layer_028": 0.039307, "value_mse_loss_layer_029": 0.046387, "value_mse_loss_layer_030": 0.052734, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.00023, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000305, "vq_loss_layer_020": 0.000228, "vq_loss_layer_021": 0.000406, "vq_loss_layer_022": 0.000305, "vq_loss_layer_023": 0.000422, "vq_loss_layer_024": 0.000425, "vq_loss_layer_025": 0.000595, "vq_loss_layer_026": 0.000687, "vq_loss_layer_027": 0.000854, "vq_loss_layer_028": 0.001465, "vq_loss_layer_029": 0.002151, "vq_loss_layer_030": 0.002991, "vq_loss_layer_031": 0.006042 }, { "ce_loss": 2.261171, "epoch": 0.01295, "grad_norm": 0.0013650376349687576, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.050659, "kv_vq_loss": 0.000425, "learning_rate": 0.001, "loss": 0.051111, "step": 12950, "value_mse_loss_layer_000": 0.000465, "value_mse_loss_layer_001": 0.001343, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.038086, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.058594, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.00022, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000207, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000475, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.001045, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.35761, "epoch": 0.01296, "grad_norm": 0.0011237834114581347, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.050626, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.051053, "step": 12960, "value_mse_loss_layer_000": 0.000471, "value_mse_loss_layer_001": 0.001335, "value_mse_loss_layer_002": 0.004639, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000161, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000206, "vq_loss_layer_025": 0.000257, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.001778, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.324548, "epoch": 0.01297, "grad_norm": 0.0013439174508675933, "key_mse_loss_layer_000": 0.002472, "key_mse_loss_layer_001": 0.009705, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.11084, "key_mse_loss_layer_011": 0.105957, "key_mse_loss_layer_012": 0.083008, "key_mse_loss_layer_013": 0.140625, "key_mse_loss_layer_014": 0.137695, "key_mse_loss_layer_015": 0.126953, "key_mse_loss_layer_016": 0.117188, "key_mse_loss_layer_017": 0.118164, "key_mse_loss_layer_018": 0.121094, "key_mse_loss_layer_019": 0.100586, "key_mse_loss_layer_020": 0.117188, "key_mse_loss_layer_021": 0.108887, "key_mse_loss_layer_022": 0.112305, "key_mse_loss_layer_023": 0.112305, "key_mse_loss_layer_024": 0.084473, "key_mse_loss_layer_025": 0.081055, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.097656, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.050568, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.051019, "step": 12970, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.008179, "value_mse_loss_layer_005": 0.007263, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009888, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.014587, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.02124, "value_mse_loss_layer_023": 0.02478, "value_mse_loss_layer_024": 0.026611, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.035889, "value_mse_loss_layer_028": 0.041016, "value_mse_loss_layer_029": 0.047607, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 7e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000179, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000216, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000366, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000385, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000362, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000243, "vq_loss_layer_021": 0.000381, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.000343, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000446, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.00042, "vq_loss_layer_028": 0.000935, "vq_loss_layer_029": 0.000633, "vq_loss_layer_030": 0.001709, "vq_loss_layer_031": 0.003815 }, { "ce_loss": 2.30295, "epoch": 0.01298, "grad_norm": 0.001291619730181992, "key_mse_loss_layer_000": 0.00354, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.090332, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.050357, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.05079, "step": 12980, "value_mse_loss_layer_000": 0.000467, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007538, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.035645, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.061523, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.053955, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000227, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000238, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000357, "vq_loss_layer_025": 0.000338, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.001427, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.003494 }, { "ce_loss": 2.313652, "epoch": 0.01299, "grad_norm": 0.0013577367644757032, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.05004, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.050488, "step": 12990, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001297, "value_mse_loss_layer_002": 0.004608, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007782, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013733, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000198, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000221, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.00021, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000437, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.000862, "vq_loss_layer_029": 0.000847, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.003784 }, { "ce_loss": 2.3202, "epoch": 0.013, "grad_norm": 0.0011906480649486184, "key_mse_loss_layer_000": 0.003769, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050345, "kv_vq_loss": 0.000413, "learning_rate": 0.001, "loss": 0.050793, "step": 13000, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.004791, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.056885, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.05249, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000248, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.324089, "epoch": 0.01301, "grad_norm": 0.001324434531852603, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.124512, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.111328, "key_mse_loss_layer_016": 0.104492, "key_mse_loss_layer_017": 0.105469, "key_mse_loss_layer_018": 0.113281, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.10498, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.102051, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050247, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050681, "step": 13010, "value_mse_loss_layer_000": 0.000462, "value_mse_loss_layer_001": 0.00132, "value_mse_loss_layer_002": 0.004669, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007629, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.05249, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000334, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000309, "vq_loss_layer_024": 0.000328, "vq_loss_layer_025": 0.000391, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.000862, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.003265 }, { "ce_loss": 2.302529, "epoch": 0.01302, "grad_norm": 0.0014695771969854832, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.09375, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.094727, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.086426, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050418, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.050882, "step": 13020, "value_mse_loss_layer_000": 0.000458, "value_mse_loss_layer_001": 0.00132, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015564, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.050537, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000169, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000201, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.000209, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000435, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.230284, "epoch": 0.01303, "grad_norm": 0.0012066909112036228, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.093262, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.088867, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050348, "kv_vq_loss": 0.000426, "learning_rate": 0.001, "loss": 0.050821, "step": 13030, "value_mse_loss_layer_000": 0.000465, "value_mse_loss_layer_001": 0.001335, "value_mse_loss_layer_002": 0.004639, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.028442, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.057373, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000292, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000209, "vq_loss_layer_023": 0.000236, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.001068, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.345786, "epoch": 0.01304, "grad_norm": 0.0009944788180291653, "key_mse_loss_layer_000": 0.003571, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.100586, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.080078, "key_mse_loss_layer_025": 0.077148, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.095703, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.05022, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.05065, "step": 13040, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001343, "value_mse_loss_layer_002": 0.0047, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000324, "vq_loss_layer_016": 0.000288, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000167, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000203, "vq_loss_layer_023": 0.000208, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000816, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.003128 }, { "ce_loss": 2.276838, "epoch": 0.01305, "grad_norm": 0.0012416484532877803, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.050357, "kv_vq_loss": 0.000416, "learning_rate": 0.001, "loss": 0.050806, "step": 13050, "value_mse_loss_layer_000": 0.000465, "value_mse_loss_layer_001": 0.001328, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000147, "vq_loss_layer_011": 0.000178, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000163, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000156, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000202, "vq_loss_layer_024": 0.000187, "vq_loss_layer_025": 0.000223, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000406, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.280894, "epoch": 0.01306, "grad_norm": 0.0012446314794942737, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.052002, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.0625, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.065918, "key_mse_loss_layer_025": 0.064453, "key_mse_loss_layer_026": 0.073242, "key_mse_loss_layer_027": 0.071777, "key_mse_loss_layer_028": 0.07959, "key_mse_loss_layer_029": 0.075195, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.050366, "kv_vq_loss": 0.000425, "learning_rate": 0.001, "loss": 0.050833, "step": 13060, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.001312, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013733, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000186, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000153, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000169, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000399, "vq_loss_layer_027": 0.000437, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000744, "vq_loss_layer_030": 0.001831, "vq_loss_layer_031": 0.002747 }, { "ce_loss": 2.263509, "epoch": 0.01307, "grad_norm": 0.0010589311132207513, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.096191, "key_mse_loss_layer_010": 0.10791, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.080078, "key_mse_loss_layer_013": 0.131836, "key_mse_loss_layer_014": 0.129883, "key_mse_loss_layer_015": 0.116699, "key_mse_loss_layer_016": 0.108887, "key_mse_loss_layer_017": 0.108398, "key_mse_loss_layer_018": 0.114746, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.106445, "key_mse_loss_layer_021": 0.100098, "key_mse_loss_layer_022": 0.104004, "key_mse_loss_layer_023": 0.100586, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.089844, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.094727, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.05032, "kv_vq_loss": 0.000413, "learning_rate": 0.001, "loss": 0.050778, "step": 13070, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.00132, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007324, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.02417, "value_mse_loss_layer_024": 0.026367, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.028564, "value_mse_loss_layer_027": 0.036621, "value_mse_loss_layer_028": 0.041748, "value_mse_loss_layer_029": 0.046631, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000195, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.000267, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000233, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000215, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000334, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000347, "vq_loss_layer_025": 0.000477, "vq_loss_layer_026": 0.000557, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.000896, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.003693 }, { "ce_loss": 2.289498, "epoch": 0.01308, "grad_norm": 0.0013004302745684981, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050574, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.051004, "step": 13080, "value_mse_loss_layer_000": 0.000465, "value_mse_loss_layer_001": 0.001335, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.00766, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000343, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.329856, "epoch": 0.01309, "grad_norm": 0.0013961243676021695, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050418, "kv_vq_loss": 0.000435, "learning_rate": 0.001, "loss": 0.0509, "step": 13090, "value_mse_loss_layer_000": 0.000467, "value_mse_loss_layer_001": 0.001312, "value_mse_loss_layer_002": 0.004547, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000341, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000209, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.001938, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.301834, "epoch": 0.0131, "grad_norm": 0.001068249111995101, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050366, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050806, "step": 13100, "value_mse_loss_layer_000": 0.000448, "value_mse_loss_layer_001": 0.001312, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.00766, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.014771, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.05249, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000184, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000294, "vq_loss_layer_013": 0.000226, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.000315, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000236, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.00015, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.000824, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002609, "vq_loss_layer_031": 0.003174 }, { "ce_loss": 2.272334, "epoch": 0.01311, "grad_norm": 0.0010923800291493535, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050339, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050763, "step": 13110, "value_mse_loss_layer_000": 0.000462, "value_mse_loss_layer_001": 0.001343, "value_mse_loss_layer_002": 0.004669, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000172, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000219, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000229, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000435, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.295191, "epoch": 0.01312, "grad_norm": 0.001468003261834383, "key_mse_loss_layer_000": 0.00264, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.044922, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.072754, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.075195, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.049979, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.050452, "step": 13120, "value_mse_loss_layer_000": 0.000463, "value_mse_loss_layer_001": 0.00132, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.017334, "value_mse_loss_layer_010": 0.013672, "value_mse_loss_layer_011": 0.014343, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.020508, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.028931, "value_mse_loss_layer_027": 0.036621, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.048096, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.00036, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000234, "vq_loss_layer_020": 0.000263, "vq_loss_layer_021": 0.000402, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.000362, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.000452, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000832, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.001823, "vq_loss_layer_031": 0.003738 }, { "ce_loss": 2.284163, "epoch": 0.01313, "grad_norm": 0.0012121142353862524, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.089355, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.080566, "kv_mse_loss": 0.050745, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.051212, "step": 13130, "value_mse_loss_layer_000": 0.000463, "value_mse_loss_layer_001": 0.00132, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013733, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.017334, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.025391, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.032959, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.034912, "value_mse_loss_layer_027": 0.04541, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.059326, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000357, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000215, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000271, "vq_loss_layer_022": 0.000198, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000401, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.277006, "epoch": 0.01314, "grad_norm": 0.0010106994304805994, "key_mse_loss_layer_000": 0.003738, "key_mse_loss_layer_001": 0.011719, "key_mse_loss_layer_002": 0.061035, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.123535, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.083008, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.092773, "key_mse_loss_layer_027": 0.095215, "key_mse_loss_layer_028": 0.099121, "key_mse_loss_layer_029": 0.092773, "key_mse_loss_layer_030": 0.098633, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.05032, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050757, "step": 13140, "value_mse_loss_layer_000": 0.000431, "value_mse_loss_layer_001": 0.001312, "value_mse_loss_layer_002": 0.004669, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007263, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.021484, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000211, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000246, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000267, "vq_loss_layer_018": 0.000221, "vq_loss_layer_019": 0.000188, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000259, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000196, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000418, "vq_loss_layer_026": 0.000668, "vq_loss_layer_027": 0.000675, "vq_loss_layer_028": 0.001022, "vq_loss_layer_029": 0.001427, "vq_loss_layer_030": 0.002808, "vq_loss_layer_031": 0.004303 }, { "ce_loss": 2.260897, "epoch": 0.01315, "grad_norm": 0.0012785953003913164, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.049911, "kv_vq_loss": 0.000415, "learning_rate": 0.001, "loss": 0.050372, "step": 13150, "value_mse_loss_layer_000": 0.000467, "value_mse_loss_layer_001": 0.001328, "value_mse_loss_layer_002": 0.004547, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.02478, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000173, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.001945, "vq_loss_layer_031": 0.002747 }, { "ce_loss": 2.343964, "epoch": 0.01316, "grad_norm": 0.0011527675669640303, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.050217, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050632, "step": 13160, "value_mse_loss_layer_000": 0.000458, "value_mse_loss_layer_001": 0.001312, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.007111, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000197, "vq_loss_layer_023": 0.000219, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000231, "vq_loss_layer_026": 0.000353, "vq_loss_layer_027": 0.000399, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000679, "vq_loss_layer_030": 0.001511, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.342854, "epoch": 0.01317, "grad_norm": 0.0013708984479308128, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.050168, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050629, "step": 13170, "value_mse_loss_layer_000": 0.000456, "value_mse_loss_layer_001": 0.00132, "value_mse_loss_layer_002": 0.004669, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.000946, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.001762, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.2925, "epoch": 0.01318, "grad_norm": 0.0010980508523061872, "key_mse_loss_layer_000": 0.003723, "key_mse_loss_layer_001": 0.01123, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.081055, "key_mse_loss_layer_025": 0.077148, "key_mse_loss_layer_026": 0.087891, "key_mse_loss_layer_027": 0.091797, "key_mse_loss_layer_028": 0.097168, "key_mse_loss_layer_029": 0.092773, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.050409, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050867, "step": 13180, "value_mse_loss_layer_000": 0.000462, "value_mse_loss_layer_001": 0.001366, "value_mse_loss_layer_002": 0.004852, "value_mse_loss_layer_003": 0.008667, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.026489, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.052734, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.056396, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 8.7e-05, "vq_loss_layer_007": 0.000135, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000194, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000372, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000187, "vq_loss_layer_020": 0.000173, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000336, "vq_loss_layer_025": 0.000364, "vq_loss_layer_026": 0.000542, "vq_loss_layer_027": 0.000607, "vq_loss_layer_028": 0.001076, "vq_loss_layer_029": 0.001259, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.003754 }, { "ce_loss": 2.297179, "epoch": 0.01319, "grad_norm": 0.0013983632670715451, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.050504, "kv_vq_loss": 0.000421, "learning_rate": 0.001, "loss": 0.050949, "step": 13190, "value_mse_loss_layer_000": 0.000467, "value_mse_loss_layer_001": 0.001328, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000357, "vq_loss_layer_018": 0.000209, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000229, "vq_loss_layer_024": 0.000204, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000751, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.25546, "epoch": 0.0132, "grad_norm": 0.0011847366113215685, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050494, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.050949, "step": 13200, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.00132, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000132, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000282, "vq_loss_layer_022": 0.000198, "vq_loss_layer_023": 0.000214, "vq_loss_layer_024": 0.000221, "vq_loss_layer_025": 0.000224, "vq_loss_layer_026": 0.000355, "vq_loss_layer_027": 0.000414, "vq_loss_layer_028": 0.000591, "vq_loss_layer_029": 0.00071, "vq_loss_layer_030": 0.001923, "vq_loss_layer_031": 0.002426 }, { "ce_loss": 2.273545, "epoch": 0.01321, "grad_norm": 0.00117498857434839, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.050589, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.051028, "step": 13210, "value_mse_loss_layer_000": 0.000463, "value_mse_loss_layer_001": 0.001328, "value_mse_loss_layer_002": 0.004547, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016846, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015564, "value_mse_loss_layer_017": 0.020752, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.02063, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.057373, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.052979, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000357, "vq_loss_layer_017": 0.000372, "vq_loss_layer_018": 0.00017, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000399, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.00095, "vq_loss_layer_029": 0.000935, "vq_loss_layer_030": 0.0019, "vq_loss_layer_031": 0.003113 }, { "ce_loss": 2.347387, "epoch": 0.01322, "grad_norm": 0.0014146955218166113, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.063477, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.050369, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050818, "step": 13220, "value_mse_loss_layer_000": 0.000463, "value_mse_loss_layer_001": 0.001328, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000177, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000189, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000317, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000136, "vq_loss_layer_020": 0.000168, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000179, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000372, "vq_loss_layer_027": 0.000465, "vq_loss_layer_028": 0.000595, "vq_loss_layer_029": 0.000847, "vq_loss_layer_030": 0.001686, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.324506, "epoch": 0.01323, "grad_norm": 0.0012376498198136687, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.05018, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.050656, "step": 13230, "value_mse_loss_layer_000": 0.000458, "value_mse_loss_layer_001": 0.001312, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000267, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000214, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000862, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.328344, "epoch": 0.01324, "grad_norm": 0.0011113026412203908, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.04993, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050385, "step": 13240, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001373, "value_mse_loss_layer_002": 0.004669, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000224, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000395, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000599, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.296479, "epoch": 0.01325, "grad_norm": 0.0011057419469580054, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.094727, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.050589, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.051041, "step": 13250, "value_mse_loss_layer_000": 0.000441, "value_mse_loss_layer_001": 0.001297, "value_mse_loss_layer_002": 0.0047, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007294, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.053711, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000246, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000296, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.000372, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.00116, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.00415 }, { "ce_loss": 2.332008, "epoch": 0.01326, "grad_norm": 0.001319271046668291, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.050351, "kv_vq_loss": 0.000415, "learning_rate": 0.001, "loss": 0.050803, "step": 13260, "value_mse_loss_layer_000": 0.000456, "value_mse_loss_layer_001": 0.001297, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007111, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000241, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000441, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.001228, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.003235 }, { "ce_loss": 2.334541, "epoch": 0.01327, "grad_norm": 0.001174934790469706, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.050192, "kv_vq_loss": 0.000407, "learning_rate": 0.001, "loss": 0.050635, "step": 13270, "value_mse_loss_layer_000": 0.000473, "value_mse_loss_layer_001": 0.001343, "value_mse_loss_layer_002": 0.004639, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.013245, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015564, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.00022, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.00082, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.002701 }, { "ce_loss": 2.270804, "epoch": 0.01328, "grad_norm": 0.001164957764558494, "key_mse_loss_layer_000": 0.003662, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.060303, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.049915, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050354, "step": 13280, "value_mse_loss_layer_000": 0.000477, "value_mse_loss_layer_001": 0.001366, "value_mse_loss_layer_002": 0.004608, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.056885, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000176, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000227, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000391, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.002701 }, { "ce_loss": 2.310362, "epoch": 0.01329, "grad_norm": 0.0014528849860653281, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.086426, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.076172, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.049994, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050433, "step": 13290, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.001305, "value_mse_loss_layer_002": 0.004547, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.007111, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.014404, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000229, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.00036, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000185, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.002823, "vq_loss_layer_031": 0.002991 }, { "ce_loss": 2.307965, "epoch": 0.0133, "grad_norm": 0.0012964893830940127, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050473, "kv_vq_loss": 0.000414, "learning_rate": 0.001, "loss": 0.050922, "step": 13300, "value_mse_loss_layer_000": 0.000456, "value_mse_loss_layer_001": 0.001305, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007263, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.012085, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.015259, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.028442, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000195, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000408, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.00079, "vq_loss_layer_030": 0.001579, "vq_loss_layer_031": 0.003006 }, { "ce_loss": 2.291306, "epoch": 0.01331, "grad_norm": 0.001344930729828775, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050391, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050824, "step": 13310, "value_mse_loss_layer_000": 0.000454, "value_mse_loss_layer_001": 0.00132, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.060303, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000189, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000238, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000816, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.003052 }, { "ce_loss": 2.334284, "epoch": 0.01332, "grad_norm": 0.0012571075931191444, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.050232, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.050693, "step": 13320, "value_mse_loss_layer_000": 0.00045, "value_mse_loss_layer_001": 0.001305, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.014282, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.02002, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000393, "vq_loss_layer_015": 0.000439, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.001236, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.003326 }, { "ce_loss": 2.282376, "epoch": 0.01333, "grad_norm": 0.0014211962698027492, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.050452, "kv_vq_loss": 0.000413, "learning_rate": 0.001, "loss": 0.050909, "step": 13330, "value_mse_loss_layer_000": 0.000456, "value_mse_loss_layer_001": 0.001282, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.00024, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.349446, "epoch": 0.01334, "grad_norm": 0.0012523551704362035, "key_mse_loss_layer_000": 0.004059, "key_mse_loss_layer_001": 0.011108, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.110352, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.100098, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.050284, "kv_vq_loss": 0.000414, "learning_rate": 0.001, "loss": 0.050739, "step": 13340, "value_mse_loss_layer_000": 0.000471, "value_mse_loss_layer_001": 0.00135, "value_mse_loss_layer_002": 0.004822, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.00769, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.012451, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000194, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000399, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.345128, "epoch": 0.01335, "grad_norm": 0.0009255550103262067, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050046, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050458, "step": 13350, "value_mse_loss_layer_000": 0.000446, "value_mse_loss_layer_001": 0.001297, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.057617, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000168, "vq_loss_layer_021": 0.000254, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000225, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000237, "vq_loss_layer_026": 0.000399, "vq_loss_layer_027": 0.000439, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.002884 }, { "ce_loss": 2.280319, "epoch": 0.01336, "grad_norm": 0.0015478539280593395, "key_mse_loss_layer_000": 0.003754, "key_mse_loss_layer_001": 0.01178, "key_mse_loss_layer_002": 0.062988, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.074219, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.124512, "key_mse_loss_layer_014": 0.120117, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.080078, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.089355, "key_mse_loss_layer_027": 0.091797, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.097168, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.050403, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050836, "step": 13360, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.001389, "value_mse_loss_layer_002": 0.0047, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.016724, "value_mse_loss_layer_016": 0.013672, "value_mse_loss_layer_017": 0.016235, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.019287, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.065918, "value_mse_loss_layer_031": 0.058105, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000171, "vq_loss_layer_008": 0.000213, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000232, "vq_loss_layer_011": 0.000228, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000401, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000237, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.000209, "vq_loss_layer_020": 0.000161, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.000203, "vq_loss_layer_023": 0.000205, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000441, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.000759, "vq_loss_layer_028": 0.000935, "vq_loss_layer_029": 0.00119, "vq_loss_layer_030": 0.002899, "vq_loss_layer_031": 0.005402 }, { "ce_loss": 2.318658, "epoch": 0.01337, "grad_norm": 0.001075469539500773, "key_mse_loss_layer_000": 0.00351, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050415, "kv_vq_loss": 0.000422, "learning_rate": 0.001, "loss": 0.050888, "step": 13370, "value_mse_loss_layer_000": 0.000469, "value_mse_loss_layer_001": 0.00132, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.00766, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009888, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019897, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.050537, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000176, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000239, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000204, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000212, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000576, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.001617, "vq_loss_layer_031": 0.002487 }, { "ce_loss": 2.346011, "epoch": 0.01338, "grad_norm": 0.0014534215442836285, "key_mse_loss_layer_000": 0.003677, "key_mse_loss_layer_001": 0.011841, "key_mse_loss_layer_002": 0.063477, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.104492, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.114746, "key_mse_loss_layer_019": 0.096191, "key_mse_loss_layer_020": 0.10791, "key_mse_loss_layer_021": 0.100586, "key_mse_loss_layer_022": 0.111328, "key_mse_loss_layer_023": 0.11084, "key_mse_loss_layer_024": 0.092285, "key_mse_loss_layer_025": 0.088867, "key_mse_loss_layer_026": 0.102051, "key_mse_loss_layer_027": 0.11084, "key_mse_loss_layer_028": 0.111816, "key_mse_loss_layer_029": 0.105957, "key_mse_loss_layer_030": 0.108887, "key_mse_loss_layer_031": 0.083496, "kv_mse_loss": 0.050034, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.05047, "step": 13380, "value_mse_loss_layer_000": 0.000465, "value_mse_loss_layer_001": 0.001358, "value_mse_loss_layer_002": 0.004791, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007263, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.014221, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012268, "value_mse_loss_layer_012": 0.013123, "value_mse_loss_layer_013": 0.014038, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.016235, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.017212, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.037354, "value_mse_loss_layer_027": 0.051514, "value_mse_loss_layer_028": 0.054443, "value_mse_loss_layer_029": 0.064941, "value_mse_loss_layer_030": 0.072754, "value_mse_loss_layer_031": 0.058594, "vq_loss_layer_000": 7e-06, "vq_loss_layer_001": 1.5e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 4e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.000135, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000163, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000157, "vq_loss_layer_012": 0.000278, "vq_loss_layer_013": 0.000203, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.00029, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000234, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000154, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000196, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000357, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000736, "vq_loss_layer_028": 0.001328, "vq_loss_layer_029": 0.001511, "vq_loss_layer_030": 0.003647, "vq_loss_layer_031": 0.004639 }, { "ce_loss": 2.304612, "epoch": 0.01339, "grad_norm": 0.001049458747729659, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.107422, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.09375, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.050333, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050769, "step": 13390, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.001312, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013733, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.024536, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.029053, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.057861, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000144, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.00029, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000164, "vq_loss_layer_021": 0.000273, "vq_loss_layer_022": 0.000199, "vq_loss_layer_023": 0.000214, "vq_loss_layer_024": 0.000188, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000372, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000587, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.333135, "epoch": 0.0134, "grad_norm": 0.0011985994642600417, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.050098, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050528, "step": 13400, "value_mse_loss_layer_000": 0.000452, "value_mse_loss_layer_001": 0.001305, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.343321, "epoch": 0.01341, "grad_norm": 0.0013619307428598404, "key_mse_loss_layer_000": 0.003616, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050577, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.051028, "step": 13410, "value_mse_loss_layer_000": 0.000452, "value_mse_loss_layer_001": 0.001305, "value_mse_loss_layer_002": 0.0047, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.053955, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000187, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000205, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000366, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000225, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000347, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.001076, "vq_loss_layer_029": 0.001183, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.00412 }, { "ce_loss": 2.286157, "epoch": 0.01342, "grad_norm": 0.0010312838712707162, "key_mse_loss_layer_000": 0.003571, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.123047, "key_mse_loss_layer_015": 0.109863, "key_mse_loss_layer_016": 0.102051, "key_mse_loss_layer_017": 0.104004, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.050052, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050476, "step": 13420, "value_mse_loss_layer_000": 0.000456, "value_mse_loss_layer_001": 0.001305, "value_mse_loss_layer_002": 0.004608, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.009338, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.024536, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.029175, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.042236, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000241, "vq_loss_layer_010": 0.000208, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000402, "vq_loss_layer_015": 0.000431, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.000359, "vq_loss_layer_022": 0.000271, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000345, "vq_loss_layer_025": 0.000481, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000778, "vq_loss_layer_028": 0.000832, "vq_loss_layer_029": 0.000896, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.003448 }, { "ce_loss": 2.325389, "epoch": 0.01343, "grad_norm": 0.0014173054369166493, "key_mse_loss_layer_000": 0.002701, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.094238, "key_mse_loss_layer_009": 0.100586, "key_mse_loss_layer_010": 0.112793, "key_mse_loss_layer_011": 0.109375, "key_mse_loss_layer_012": 0.083496, "key_mse_loss_layer_013": 0.148438, "key_mse_loss_layer_014": 0.143555, "key_mse_loss_layer_015": 0.12793, "key_mse_loss_layer_016": 0.122559, "key_mse_loss_layer_017": 0.121094, "key_mse_loss_layer_018": 0.126953, "key_mse_loss_layer_019": 0.099121, "key_mse_loss_layer_020": 0.115723, "key_mse_loss_layer_021": 0.109375, "key_mse_loss_layer_022": 0.11377, "key_mse_loss_layer_023": 0.111328, "key_mse_loss_layer_024": 0.086426, "key_mse_loss_layer_025": 0.080566, "key_mse_loss_layer_026": 0.096191, "key_mse_loss_layer_027": 0.090332, "key_mse_loss_layer_028": 0.101074, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.097168, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049857, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.05029, "step": 13430, "value_mse_loss_layer_000": 0.00045, "value_mse_loss_layer_001": 0.001282, "value_mse_loss_layer_002": 0.004608, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.00766, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.01709, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.021484, "value_mse_loss_layer_022": 0.020752, "value_mse_loss_layer_023": 0.023682, "value_mse_loss_layer_024": 0.025879, "value_mse_loss_layer_025": 0.032471, "value_mse_loss_layer_026": 0.027954, "value_mse_loss_layer_027": 0.035645, "value_mse_loss_layer_028": 0.040283, "value_mse_loss_layer_029": 0.046631, "value_mse_loss_layer_030": 0.051758, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000239, "vq_loss_layer_010": 0.000221, "vq_loss_layer_011": 0.00022, "vq_loss_layer_012": 0.000391, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000418, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.00022, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.000243, "vq_loss_layer_021": 0.000368, "vq_loss_layer_022": 0.000313, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000366, "vq_loss_layer_025": 0.000553, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000954, "vq_loss_layer_029": 0.000729, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.003937 }, { "ce_loss": 2.32111, "epoch": 0.01344, "grad_norm": 0.0011684957426041365, "key_mse_loss_layer_000": 0.002609, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050226, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050659, "step": 13440, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.01355, "value_mse_loss_layer_011": 0.013977, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000208, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.00042, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000305, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.000311, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000444, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.000744, "vq_loss_layer_030": 0.001717, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.308102, "epoch": 0.01345, "grad_norm": 0.0010840297909453511, "key_mse_loss_layer_000": 0.003662, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.060059, "key_mse_loss_layer_005": 0.065918, "key_mse_loss_layer_006": 0.072266, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.050348, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050769, "step": 13450, "value_mse_loss_layer_000": 0.000456, "value_mse_loss_layer_001": 0.001305, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.007263, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000232, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000215, "vq_loss_layer_024": 0.00021, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.000401, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.349524, "epoch": 0.01346, "grad_norm": 0.0013801859458908439, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050397, "kv_vq_loss": 0.000416, "learning_rate": 0.001, "loss": 0.050864, "step": 13460, "value_mse_loss_layer_000": 0.000458, "value_mse_loss_layer_001": 0.001297, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000205, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.34469, "epoch": 0.01347, "grad_norm": 0.0012865100288763642, "key_mse_loss_layer_000": 0.00354, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.084961, "kv_mse_loss": 0.050146, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.050598, "step": 13470, "value_mse_loss_layer_000": 0.000458, "value_mse_loss_layer_001": 0.00132, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.00016, "vq_loss_layer_021": 0.000273, "vq_loss_layer_022": 0.000205, "vq_loss_layer_023": 0.000217, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000942, "vq_loss_layer_029": 0.00135, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.003601 }, { "ce_loss": 2.282784, "epoch": 0.01348, "grad_norm": 0.001136183156631887, "key_mse_loss_layer_000": 0.00354, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.050238, "kv_vq_loss": 0.000415, "learning_rate": 0.001, "loss": 0.050681, "step": 13480, "value_mse_loss_layer_000": 0.000458, "value_mse_loss_layer_001": 0.001328, "value_mse_loss_layer_002": 0.004761, "value_mse_loss_layer_003": 0.008606, "value_mse_loss_layer_004": 0.007935, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.056885, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000345, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.00095, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003891 }, { "ce_loss": 2.323214, "epoch": 0.01349, "grad_norm": 0.001158767263405025, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.061279, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.050006, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050449, "step": 13490, "value_mse_loss_layer_000": 0.000458, "value_mse_loss_layer_001": 0.001305, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000191, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000161, "vq_loss_layer_021": 0.000269, "vq_loss_layer_022": 0.000189, "vq_loss_layer_023": 0.000213, "vq_loss_layer_024": 0.000193, "vq_loss_layer_025": 0.000224, "vq_loss_layer_026": 0.00037, "vq_loss_layer_027": 0.000439, "vq_loss_layer_028": 0.000534, "vq_loss_layer_029": 0.000713, "vq_loss_layer_030": 0.0019, "vq_loss_layer_031": 0.002304 }, { "ce_loss": 2.321171, "epoch": 0.0135, "grad_norm": 0.0012214711168780923, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.042236, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.12207, "key_mse_loss_layer_015": 0.111816, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.050165, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050601, "step": 13500, "value_mse_loss_layer_000": 0.000425, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.015564, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.0002, "vq_loss_layer_009": 0.000248, "vq_loss_layer_010": 0.000248, "vq_loss_layer_011": 0.000246, "vq_loss_layer_012": 0.000402, "vq_loss_layer_013": 0.00037, "vq_loss_layer_014": 0.000423, "vq_loss_layer_015": 0.000515, "vq_loss_layer_016": 0.000404, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000197, "vq_loss_layer_020": 0.000305, "vq_loss_layer_021": 0.000444, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.000399, "vq_loss_layer_025": 0.000469, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000702, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.001099, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.31993, "epoch": 0.01351, "grad_norm": 0.0014772856375202537, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049905, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050327, "step": 13510, "value_mse_loss_layer_000": 0.000463, "value_mse_loss_layer_001": 0.001312, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000385, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000599, "vq_loss_layer_028": 0.00071, "vq_loss_layer_029": 0.000809, "vq_loss_layer_030": 0.002655, "vq_loss_layer_031": 0.003281 }, { "ce_loss": 2.305664, "epoch": 0.01352, "grad_norm": 0.0013563520042225718, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.060303, "kv_mse_loss": 0.049982, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.050446, "step": 13520, "value_mse_loss_layer_000": 0.000454, "value_mse_loss_layer_001": 0.001297, "value_mse_loss_layer_002": 0.004639, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007324, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000224, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000393, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000359, "vq_loss_layer_022": 0.00028, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000391, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000954, "vq_loss_layer_028": 0.000904, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.001701, "vq_loss_layer_031": 0.003693 }, { "ce_loss": 2.311882, "epoch": 0.01353, "grad_norm": 0.0010557210771366954, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.049939, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050372, "step": 13530, "value_mse_loss_layer_000": 0.000454, "value_mse_loss_layer_001": 0.001297, "value_mse_loss_layer_002": 0.004608, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015564, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.00022, "vq_loss_layer_012": 0.000374, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000224, "vq_loss_layer_020": 0.000244, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000301, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000484, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.286273, "epoch": 0.01354, "grad_norm": 0.0012785305734723806, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.060547, "kv_mse_loss": 0.050128, "kv_vq_loss": 0.000414, "learning_rate": 0.001, "loss": 0.05058, "step": 13540, "value_mse_loss_layer_000": 0.000463, "value_mse_loss_layer_001": 0.001312, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.00766, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000416, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000191, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.00021, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000484, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.000977, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.322973, "epoch": 0.01355, "grad_norm": 0.0013003875501453876, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.061279, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.104004, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.114746, "key_mse_loss_layer_019": 0.096191, "key_mse_loss_layer_020": 0.108398, "key_mse_loss_layer_021": 0.100586, "key_mse_loss_layer_022": 0.106445, "key_mse_loss_layer_023": 0.108398, "key_mse_loss_layer_024": 0.087891, "key_mse_loss_layer_025": 0.083008, "key_mse_loss_layer_026": 0.097656, "key_mse_loss_layer_027": 0.098633, "key_mse_loss_layer_028": 0.103516, "key_mse_loss_layer_029": 0.094238, "key_mse_loss_layer_030": 0.101074, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.050385, "kv_vq_loss": 0.000414, "learning_rate": 0.001, "loss": 0.050842, "step": 13550, "value_mse_loss_layer_000": 0.000433, "value_mse_loss_layer_001": 0.001282, "value_mse_loss_layer_002": 0.004669, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007324, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.014587, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013245, "value_mse_loss_layer_013": 0.014343, "value_mse_loss_layer_014": 0.015076, "value_mse_loss_layer_015": 0.016479, "value_mse_loss_layer_016": 0.01355, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.054688, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.00029, "vq_loss_layer_013": 0.000223, "vq_loss_layer_014": 0.000313, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000237, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.000265, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000243, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.004486 }, { "ce_loss": 2.244922, "epoch": 0.01356, "grad_norm": 0.0012570966500788927, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050452, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050894, "step": 13560, "value_mse_loss_layer_000": 0.000454, "value_mse_loss_layer_001": 0.001282, "value_mse_loss_layer_002": 0.004547, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000239, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000275, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.327586, "epoch": 0.01357, "grad_norm": 0.0011365379905328155, "key_mse_loss_layer_000": 0.002701, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049683, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050098, "step": 13570, "value_mse_loss_layer_000": 0.000437, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000288, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000465, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000725, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.002838 }, { "ce_loss": 2.307825, "epoch": 0.01358, "grad_norm": 0.0013280917191877961, "key_mse_loss_layer_000": 0.003571, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.049817, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.050214, "step": 13580, "value_mse_loss_layer_000": 0.000473, "value_mse_loss_layer_001": 0.001335, "value_mse_loss_layer_002": 0.004639, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.009705, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000163, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000233, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000239, "vq_loss_layer_024": 0.000311, "vq_loss_layer_025": 0.000338, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000679, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002563, "vq_loss_layer_031": 0.003326 }, { "ce_loss": 2.313744, "epoch": 0.01359, "grad_norm": 0.0012967521324753761, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.061523, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.080566, "kv_mse_loss": 0.050113, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050571, "step": 13590, "value_mse_loss_layer_000": 0.000448, "value_mse_loss_layer_001": 0.001289, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 2e-06, "vq_loss_layer_003": 1.1e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.00022, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000196, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000435, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.001732, "vq_loss_layer_031": 0.00267 }, { "ce_loss": 2.284195, "epoch": 0.0136, "grad_norm": 0.0011905707651749253, "key_mse_loss_layer_000": 0.002808, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049811, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050244, "step": 13600, "value_mse_loss_layer_000": 0.000433, "value_mse_loss_layer_001": 0.001266, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007324, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000389, "vq_loss_layer_017": 0.000347, "vq_loss_layer_018": 0.000218, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000229, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000288, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000381, "vq_loss_layer_026": 0.000441, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000908, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.001923, "vq_loss_layer_031": 0.003281 }, { "ce_loss": 2.249095, "epoch": 0.01361, "grad_norm": 0.0011384418467059731, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.050702, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.051154, "step": 13610, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.001297, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013184, "value_mse_loss_layer_013": 0.014526, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.013733, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000284, "vq_loss_layer_013": 0.00022, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000286, "vq_loss_layer_016": 0.000271, "vq_loss_layer_017": 0.000228, "vq_loss_layer_018": 0.000154, "vq_loss_layer_019": 0.000134, "vq_loss_layer_020": 0.000159, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.000195, "vq_loss_layer_023": 0.000213, "vq_loss_layer_024": 0.000224, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000484, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000702, "vq_loss_layer_030": 0.001579, "vq_loss_layer_031": 0.002716 }, { "ce_loss": 2.293417, "epoch": 0.01362, "grad_norm": 0.0012176791206002235, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.061768, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.050226, "kv_vq_loss": 0.000407, "learning_rate": 0.001, "loss": 0.050647, "step": 13620, "value_mse_loss_layer_000": 0.00045, "value_mse_loss_layer_001": 0.001282, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000275, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000218, "vq_loss_layer_024": 0.000206, "vq_loss_layer_025": 0.000241, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000599, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.002609 }, { "ce_loss": 2.318803, "epoch": 0.01363, "grad_norm": 0.0013250478077679873, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.050302, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.05076, "step": 13630, "value_mse_loss_layer_000": 0.000454, "value_mse_loss_layer_001": 0.001289, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.003265 }, { "ce_loss": 2.343526, "epoch": 0.01364, "grad_norm": 0.0013128010323271155, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.10791, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049966, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050421, "step": 13640, "value_mse_loss_layer_000": 0.000448, "value_mse_loss_layer_001": 0.001266, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.000816, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.326153, "epoch": 0.01365, "grad_norm": 0.0014094742946326733, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.050531, "kv_vq_loss": 0.000416, "learning_rate": 0.001, "loss": 0.050986, "step": 13650, "value_mse_loss_layer_000": 0.000452, "value_mse_loss_layer_001": 0.001282, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013245, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000207, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000467, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.00106, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.293567, "epoch": 0.01366, "grad_norm": 0.0012053627287968993, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.049994, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050439, "step": 13660, "value_mse_loss_layer_000": 0.000448, "value_mse_loss_layer_001": 0.001274, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007141, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000343, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.342316, "epoch": 0.01367, "grad_norm": 0.001077453838661313, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049857, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050293, "step": 13670, "value_mse_loss_layer_000": 0.000443, "value_mse_loss_layer_001": 0.001282, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000228, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000383, "vq_loss_layer_027": 0.000431, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.000938, "vq_loss_layer_030": 0.001755, "vq_loss_layer_031": 0.002808 }, { "ce_loss": 2.340104, "epoch": 0.01368, "grad_norm": 0.0012631842400878668, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050082, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050516, "step": 13680, "value_mse_loss_layer_000": 0.000452, "value_mse_loss_layer_001": 0.001289, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000136, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.00029, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000168, "vq_loss_layer_021": 0.000278, "vq_loss_layer_022": 0.000202, "vq_loss_layer_023": 0.000231, "vq_loss_layer_024": 0.000202, "vq_loss_layer_025": 0.000248, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000435, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.326771, "epoch": 0.01369, "grad_norm": 0.0012789050815626979, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.055664, "key_mse_loss_layer_006": 0.0625, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049658, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050098, "step": 13690, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.001289, "value_mse_loss_layer_002": 0.004547, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.2e-05, "vq_loss_layer_007": 0.000137, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000177, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000171, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000235, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000334, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000205, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000965, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.286118, "epoch": 0.0137, "grad_norm": 0.0012368489988148212, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.050497, "kv_vq_loss": 0.000416, "learning_rate": 0.001, "loss": 0.050943, "step": 13700, "value_mse_loss_layer_000": 0.00045, "value_mse_loss_layer_001": 0.001282, "value_mse_loss_layer_002": 0.004547, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.007141, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.00024, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.238262, "epoch": 0.01371, "grad_norm": 0.0013907154789194465, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.050638, "kv_vq_loss": 0.000424, "learning_rate": 0.001, "loss": 0.051108, "step": 13710, "value_mse_loss_layer_000": 0.000458, "value_mse_loss_layer_001": 0.001305, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.020508, "value_mse_loss_layer_020": 0.022583, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.030151, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.046631, "value_mse_loss_layer_028": 0.053467, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.067383, "value_mse_loss_layer_031": 0.056641, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000148, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000282, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000954, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.325483, "epoch": 0.01372, "grad_norm": 0.0010539989452809095, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.053711, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.050165, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050613, "step": 13720, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001266, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000135, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000155, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000299, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000282, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.00016, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000193, "vq_loss_layer_023": 0.000217, "vq_loss_layer_024": 0.000197, "vq_loss_layer_025": 0.000214, "vq_loss_layer_026": 0.000347, "vq_loss_layer_027": 0.000401, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000683, "vq_loss_layer_030": 0.001358, "vq_loss_layer_031": 0.002533 }, { "ce_loss": 2.310423, "epoch": 0.01373, "grad_norm": 0.0012579277390614152, "key_mse_loss_layer_000": 0.005035, "key_mse_loss_layer_001": 0.013428, "key_mse_loss_layer_002": 0.066895, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.070312, "key_mse_loss_layer_006": 0.084473, "key_mse_loss_layer_007": 0.085938, "key_mse_loss_layer_008": 0.097656, "key_mse_loss_layer_009": 0.103516, "key_mse_loss_layer_010": 0.118652, "key_mse_loss_layer_011": 0.115234, "key_mse_loss_layer_012": 0.084961, "key_mse_loss_layer_013": 0.140625, "key_mse_loss_layer_014": 0.136719, "key_mse_loss_layer_015": 0.130859, "key_mse_loss_layer_016": 0.122559, "key_mse_loss_layer_017": 0.121094, "key_mse_loss_layer_018": 0.12793, "key_mse_loss_layer_019": 0.108887, "key_mse_loss_layer_020": 0.122559, "key_mse_loss_layer_021": 0.11377, "key_mse_loss_layer_022": 0.114258, "key_mse_loss_layer_023": 0.109375, "key_mse_loss_layer_024": 0.093262, "key_mse_loss_layer_025": 0.083008, "key_mse_loss_layer_026": 0.105957, "key_mse_loss_layer_027": 0.107422, "key_mse_loss_layer_028": 0.10791, "key_mse_loss_layer_029": 0.101562, "key_mse_loss_layer_030": 0.120605, "key_mse_loss_layer_031": 0.088867, "kv_mse_loss": 0.050369, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.050809, "step": 13730, "value_mse_loss_layer_000": 0.000492, "value_mse_loss_layer_001": 0.001381, "value_mse_loss_layer_002": 0.004883, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007751, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.014282, "value_mse_loss_layer_012": 0.015137, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.024536, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.04126, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 2.1e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 7e-05, "vq_loss_layer_005": 9.1e-05, "vq_loss_layer_006": 0.000131, "vq_loss_layer_007": 0.000199, "vq_loss_layer_008": 0.000219, "vq_loss_layer_009": 0.000277, "vq_loss_layer_010": 0.000273, "vq_loss_layer_011": 0.000275, "vq_loss_layer_012": 0.000402, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000507, "vq_loss_layer_015": 0.000469, "vq_loss_layer_016": 0.000454, "vq_loss_layer_017": 0.000359, "vq_loss_layer_018": 0.000229, "vq_loss_layer_019": 0.000261, "vq_loss_layer_020": 0.000288, "vq_loss_layer_021": 0.000408, "vq_loss_layer_022": 0.000328, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000406, "vq_loss_layer_025": 0.000648, "vq_loss_layer_026": 0.000809, "vq_loss_layer_027": 0.001022, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001694, "vq_loss_layer_030": 0.003616, "vq_loss_layer_031": 0.005157 }, { "ce_loss": 2.295251, "epoch": 0.01374, "grad_norm": 0.0011653327383100986, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.050369, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050806, "step": 13740, "value_mse_loss_layer_000": 0.000456, "value_mse_loss_layer_001": 0.001297, "value_mse_loss_layer_002": 0.004608, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000134, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.00029, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000254, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000163, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.003159 }, { "ce_loss": 2.34582, "epoch": 0.01375, "grad_norm": 0.0012916885316371918, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.131836, "key_mse_loss_layer_014": 0.12793, "key_mse_loss_layer_015": 0.115234, "key_mse_loss_layer_016": 0.11084, "key_mse_loss_layer_017": 0.111816, "key_mse_loss_layer_018": 0.120117, "key_mse_loss_layer_019": 0.098145, "key_mse_loss_layer_020": 0.111328, "key_mse_loss_layer_021": 0.105469, "key_mse_loss_layer_022": 0.109375, "key_mse_loss_layer_023": 0.108398, "key_mse_loss_layer_024": 0.084473, "key_mse_loss_layer_025": 0.080566, "key_mse_loss_layer_026": 0.09375, "key_mse_loss_layer_027": 0.090332, "key_mse_loss_layer_028": 0.100098, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050305, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.050751, "step": 13750, "value_mse_loss_layer_000": 0.000435, "value_mse_loss_layer_001": 0.001266, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.008118, "value_mse_loss_layer_005": 0.007446, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000195, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.000206, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.00024, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000381, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000862, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.003433 }, { "ce_loss": 2.326183, "epoch": 0.01376, "grad_norm": 0.0011203078320249915, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.050177, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050604, "step": 13760, "value_mse_loss_layer_000": 0.00045, "value_mse_loss_layer_001": 0.001274, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000135, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000273, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000224, "vq_loss_layer_024": 0.0002, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.00042, "vq_loss_layer_028": 0.000603, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.001831, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.306816, "epoch": 0.01377, "grad_norm": 0.0012890036450698972, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.124512, "key_mse_loss_layer_014": 0.12207, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.104004, "key_mse_loss_layer_017": 0.10498, "key_mse_loss_layer_018": 0.111328, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050217, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050641, "step": 13770, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.007141, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000191, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000317, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000259, "vq_loss_layer_018": 0.000163, "vq_loss_layer_019": 0.000134, "vq_loss_layer_020": 0.000166, "vq_loss_layer_021": 0.000275, "vq_loss_layer_022": 0.000179, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000401, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.001778, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.285575, "epoch": 0.01378, "grad_norm": 0.0012033210368826985, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050635, "kv_vq_loss": 0.000414, "learning_rate": 0.001, "loss": 0.051093, "step": 13780, "value_mse_loss_layer_000": 0.000456, "value_mse_loss_layer_001": 0.001274, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000212, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000385, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.001671, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.297792, "epoch": 0.01379, "grad_norm": 0.0010656663216650486, "key_mse_loss_layer_000": 0.002884, "key_mse_loss_layer_001": 0.009583, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.086426, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066406, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.073242, "key_mse_loss_layer_027": 0.072266, "key_mse_loss_layer_028": 0.07959, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.05022, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050662, "step": 13790, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001244, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000142, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000359, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.001167, "vq_loss_layer_030": 0.001938, "vq_loss_layer_031": 0.003555 }, { "ce_loss": 2.312563, "epoch": 0.0138, "grad_norm": 0.0011613917304202914, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050558, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050986, "step": 13800, "value_mse_loss_layer_000": 0.000441, "value_mse_loss_layer_001": 0.001266, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000189, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000223, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.001892, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.320007, "epoch": 0.01381, "grad_norm": 0.0010650171898305416, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.049921, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050363, "step": 13810, "value_mse_loss_layer_000": 0.00045, "value_mse_loss_layer_001": 0.001274, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000236, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.001228, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.003784 }, { "ce_loss": 2.285413, "epoch": 0.01382, "grad_norm": 0.0011955061927437782, "key_mse_loss_layer_000": 0.002701, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.050211, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.05065, "step": 13820, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.001244, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007629, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000429, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.00024, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.000656, "vq_loss_layer_030": 0.001472, "vq_loss_layer_031": 0.002838 }, { "ce_loss": 2.306229, "epoch": 0.01383, "grad_norm": 0.0012564653297886252, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049869, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050323, "step": 13830, "value_mse_loss_layer_000": 0.000446, "value_mse_loss_layer_001": 0.001251, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000137, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000213, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000243, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000408, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000729, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.35825, "epoch": 0.01384, "grad_norm": 0.0010853614658117294, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049457, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.04986, "step": 13840, "value_mse_loss_layer_000": 0.000454, "value_mse_loss_layer_001": 0.001274, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000209, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000437, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.299271, "epoch": 0.01385, "grad_norm": 0.0011304928921163082, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049982, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050409, "step": 13850, "value_mse_loss_layer_000": 0.00045, "value_mse_loss_layer_001": 0.001305, "value_mse_loss_layer_002": 0.004639, "value_mse_loss_layer_003": 0.00885, "value_mse_loss_layer_004": 0.00766, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000176, "vq_loss_layer_012": 0.000292, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.00021, "vq_loss_layer_019": 0.000136, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000273, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.000851, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000896, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.003098 }, { "ce_loss": 2.341296, "epoch": 0.01386, "grad_norm": 0.0010816633002832532, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.054443, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050308, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050745, "step": 13860, "value_mse_loss_layer_000": 0.000448, "value_mse_loss_layer_001": 0.001266, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000317, "vq_loss_layer_015": 0.00033, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000278, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.000668, "vq_loss_layer_029": 0.000751, "vq_loss_layer_030": 0.001701, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.347045, "epoch": 0.01387, "grad_norm": 0.0011024315608665347, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050146, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.05058, "step": 13870, "value_mse_loss_layer_000": 0.000454, "value_mse_loss_layer_001": 0.001282, "value_mse_loss_layer_002": 0.004547, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000215, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000242, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000431, "vq_loss_layer_028": 0.00071, "vq_loss_layer_029": 0.000816, "vq_loss_layer_030": 0.001709, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.310625, "epoch": 0.01388, "grad_norm": 0.0010807206854224205, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.050159, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050601, "step": 13880, "value_mse_loss_layer_000": 0.000444, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000178, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000176, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000152, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.00016, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000204, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000372, "vq_loss_layer_027": 0.000391, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.001518, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.281839, "epoch": 0.01389, "grad_norm": 0.0011428255820646882, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.05011, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050555, "step": 13890, "value_mse_loss_layer_000": 0.000444, "value_mse_loss_layer_001": 0.001282, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000221, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.002762 }, { "ce_loss": 2.275137, "epoch": 0.0139, "grad_norm": 0.0014075050130486488, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.041748, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.09375, "key_mse_loss_layer_009": 0.099609, "key_mse_loss_layer_010": 0.11084, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.081055, "key_mse_loss_layer_013": 0.139648, "key_mse_loss_layer_014": 0.137695, "key_mse_loss_layer_015": 0.123047, "key_mse_loss_layer_016": 0.116699, "key_mse_loss_layer_017": 0.115234, "key_mse_loss_layer_018": 0.120117, "key_mse_loss_layer_019": 0.095703, "key_mse_loss_layer_020": 0.11084, "key_mse_loss_layer_021": 0.106934, "key_mse_loss_layer_022": 0.111328, "key_mse_loss_layer_023": 0.106934, "key_mse_loss_layer_024": 0.083984, "key_mse_loss_layer_025": 0.079102, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.09082, "key_mse_loss_layer_028": 0.098633, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.09375, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.050458, "kv_vq_loss": 0.00042, "learning_rate": 0.001, "loss": 0.050919, "step": 13900, "value_mse_loss_layer_000": 0.000437, "value_mse_loss_layer_001": 0.001251, "value_mse_loss_layer_002": 0.004547, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015198, "value_mse_loss_layer_019": 0.017456, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.021362, "value_mse_loss_layer_023": 0.023193, "value_mse_loss_layer_024": 0.025513, "value_mse_loss_layer_025": 0.031494, "value_mse_loss_layer_026": 0.027466, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.041016, "value_mse_loss_layer_029": 0.047852, "value_mse_loss_layer_030": 0.05249, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.00018, "vq_loss_layer_008": 0.000187, "vq_loss_layer_009": 0.000241, "vq_loss_layer_010": 0.000243, "vq_loss_layer_011": 0.000233, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000414, "vq_loss_layer_015": 0.000427, "vq_loss_layer_016": 0.000376, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000229, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000278, "vq_loss_layer_021": 0.000441, "vq_loss_layer_022": 0.000311, "vq_loss_layer_023": 0.000418, "vq_loss_layer_024": 0.000389, "vq_loss_layer_025": 0.000576, "vq_loss_layer_026": 0.000652, "vq_loss_layer_027": 0.000732, "vq_loss_layer_028": 0.001175, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.003052, "vq_loss_layer_031": 0.003647 }, { "ce_loss": 2.361677, "epoch": 0.01391, "grad_norm": 0.0011351397261023521, "key_mse_loss_layer_000": 0.002731, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.050153, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.05061, "step": 13910, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.015564, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.00037, "vq_loss_layer_022": 0.000242, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000242, "vq_loss_layer_025": 0.000351, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.331685, "epoch": 0.01392, "grad_norm": 0.0013048005057498813, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.050128, "kv_vq_loss": 0.000407, "learning_rate": 0.001, "loss": 0.050574, "step": 13920, "value_mse_loss_layer_000": 0.000456, "value_mse_loss_layer_001": 0.001297, "value_mse_loss_layer_002": 0.004608, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000233, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000402, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.331339, "epoch": 0.01393, "grad_norm": 0.001130910124629736, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.09668, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.133789, "key_mse_loss_layer_014": 0.129883, "key_mse_loss_layer_015": 0.117676, "key_mse_loss_layer_016": 0.11084, "key_mse_loss_layer_017": 0.108887, "key_mse_loss_layer_018": 0.114746, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.101074, "key_mse_loss_layer_022": 0.105957, "key_mse_loss_layer_023": 0.102539, "key_mse_loss_layer_024": 0.081055, "key_mse_loss_layer_025": 0.077148, "key_mse_loss_layer_026": 0.091309, "key_mse_loss_layer_027": 0.09082, "key_mse_loss_layer_028": 0.096191, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050003, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.05043, "step": 13930, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.001289, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.01001, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015076, "value_mse_loss_layer_019": 0.017578, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.021973, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.0271, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.029419, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.00025, "vq_loss_layer_010": 0.000207, "vq_loss_layer_011": 0.00023, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000406, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.00036, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000232, "vq_loss_layer_021": 0.000368, "vq_loss_layer_022": 0.000288, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000431, "vq_loss_layer_026": 0.000486, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.001022, "vq_loss_layer_029": 0.001175, "vq_loss_layer_030": 0.003357, "vq_loss_layer_031": 0.003799 }, { "ce_loss": 2.301683, "epoch": 0.01394, "grad_norm": 0.001193474163301289, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050296, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050732, "step": 13940, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001266, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000182, "vq_loss_layer_010": 0.000157, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000219, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.00025, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.002762 }, { "ce_loss": 2.280814, "epoch": 0.01395, "grad_norm": 0.0011455578496679664, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.059814, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.05004, "kv_vq_loss": 0.000417, "learning_rate": 0.001, "loss": 0.050507, "step": 13950, "value_mse_loss_layer_000": 0.000452, "value_mse_loss_layer_001": 0.001297, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007141, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021729, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000172, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000263, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.00024, "vq_loss_layer_023": 0.000207, "vq_loss_layer_024": 0.000226, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000475, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000973, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.317027, "epoch": 0.01396, "grad_norm": 0.0013231047196313739, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.12207, "key_mse_loss_layer_015": 0.11084, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.10498, "key_mse_loss_layer_018": 0.111328, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.100098, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.050015, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050461, "step": 13960, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.001221, "value_mse_loss_layer_002": 0.004547, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.007507, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.013245, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.050537, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000228, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000214, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000425, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.000797, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.003525 }, { "ce_loss": 2.302872, "epoch": 0.01397, "grad_norm": 0.001109316828660667, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050574, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.051007, "step": 13970, "value_mse_loss_layer_000": 0.000456, "value_mse_loss_layer_001": 0.001282, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.024536, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.028564, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.057373, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000153, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000391, "vq_loss_layer_014": 0.000303, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000214, "vq_loss_layer_024": 0.000209, "vq_loss_layer_025": 0.000233, "vq_loss_layer_026": 0.000401, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000572, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.00161, "vq_loss_layer_031": 0.002502 }, { "ce_loss": 2.337034, "epoch": 0.01398, "grad_norm": 0.0013145923148840666, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.060303, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049942, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050366, "step": 13980, "value_mse_loss_layer_000": 0.000441, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013245, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019775, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000123, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.33607, "epoch": 0.01399, "grad_norm": 0.0012162826023995876, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050003, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050452, "step": 13990, "value_mse_loss_layer_000": 0.00045, "value_mse_loss_layer_001": 0.001266, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.002289, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.301694, "epoch": 0.014, "grad_norm": 0.0013778244610875845, "key_mse_loss_layer_000": 0.002365, "key_mse_loss_layer_001": 0.009583, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.044189, "key_mse_loss_layer_004": 0.041504, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.139648, "key_mse_loss_layer_014": 0.134766, "key_mse_loss_layer_015": 0.121094, "key_mse_loss_layer_016": 0.114258, "key_mse_loss_layer_017": 0.116211, "key_mse_loss_layer_018": 0.119141, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.109863, "key_mse_loss_layer_021": 0.104004, "key_mse_loss_layer_022": 0.106934, "key_mse_loss_layer_023": 0.104492, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.089844, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.061035, "kv_mse_loss": 0.050107, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050534, "step": 14000, "value_mse_loss_layer_000": 0.000425, "value_mse_loss_layer_001": 0.001205, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.00766, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.013611, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.014648, "value_mse_loss_layer_019": 0.017456, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.020508, "value_mse_loss_layer_023": 0.023804, "value_mse_loss_layer_024": 0.025513, "value_mse_loss_layer_025": 0.032227, "value_mse_loss_layer_026": 0.02832, "value_mse_loss_layer_027": 0.035889, "value_mse_loss_layer_028": 0.040771, "value_mse_loss_layer_029": 0.045166, "value_mse_loss_layer_030": 0.05249, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 6.4e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000172, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000236, "vq_loss_layer_010": 0.00021, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000374, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.00017, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000228, "vq_loss_layer_021": 0.000414, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.000345, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.000437, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.000671, "vq_loss_layer_030": 0.003067, "vq_loss_layer_031": 0.003677 }, { "ce_loss": 2.250748, "epoch": 0.01401, "grad_norm": 0.001192178693599999, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050433, "kv_vq_loss": 0.000417, "learning_rate": 0.001, "loss": 0.050903, "step": 14010, "value_mse_loss_layer_000": 0.00045, "value_mse_loss_layer_001": 0.001266, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000231, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.00069, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.003052 }, { "ce_loss": 2.301646, "epoch": 0.01402, "grad_norm": 0.0011874038027599454, "key_mse_loss_layer_000": 0.004669, "key_mse_loss_layer_001": 0.012024, "key_mse_loss_layer_002": 0.060059, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.071777, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.095703, "key_mse_loss_layer_020": 0.102539, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.086426, "key_mse_loss_layer_025": 0.078125, "key_mse_loss_layer_026": 0.094727, "key_mse_loss_layer_027": 0.098633, "key_mse_loss_layer_028": 0.099121, "key_mse_loss_layer_029": 0.097656, "key_mse_loss_layer_030": 0.105469, "key_mse_loss_layer_031": 0.080078, "kv_mse_loss": 0.050458, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050894, "step": 14020, "value_mse_loss_layer_000": 0.000465, "value_mse_loss_layer_001": 0.001328, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.014343, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.014587, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.016846, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.016846, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.0354, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.05957, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.05542, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000191, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000401, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000254, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000188, "vq_loss_layer_020": 0.000161, "vq_loss_layer_021": 0.000269, "vq_loss_layer_022": 0.000187, "vq_loss_layer_023": 0.000196, "vq_loss_layer_024": 0.000326, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000467, "vq_loss_layer_027": 0.000637, "vq_loss_layer_028": 0.000801, "vq_loss_layer_029": 0.001434, "vq_loss_layer_030": 0.002792, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.3362, "epoch": 0.01403, "grad_norm": 0.0014713204000145197, "key_mse_loss_layer_000": 0.002533, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.060303, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.129883, "key_mse_loss_layer_014": 0.124023, "key_mse_loss_layer_015": 0.111328, "key_mse_loss_layer_016": 0.106934, "key_mse_loss_layer_017": 0.107422, "key_mse_loss_layer_018": 0.11377, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.099609, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.050223, "kv_vq_loss": 0.000407, "learning_rate": 0.001, "loss": 0.050677, "step": 14030, "value_mse_loss_layer_000": 0.000431, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.00473, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007996, "value_mse_loss_layer_005": 0.007385, "value_mse_loss_layer_006": 0.009399, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.014221, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.0177, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.021606, "value_mse_loss_layer_023": 0.022705, "value_mse_loss_layer_024": 0.025635, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.028198, "value_mse_loss_layer_027": 0.035645, "value_mse_loss_layer_028": 0.039062, "value_mse_loss_layer_029": 0.045654, "value_mse_loss_layer_030": 0.051514, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.4e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 8e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000188, "vq_loss_layer_009": 0.000239, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000239, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000488, "vq_loss_layer_014": 0.000402, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000267, "vq_loss_layer_021": 0.000456, "vq_loss_layer_022": 0.00028, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.00032, "vq_loss_layer_025": 0.000553, "vq_loss_layer_026": 0.000717, "vq_loss_layer_027": 0.000713, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.000847, "vq_loss_layer_030": 0.003113, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.331019, "epoch": 0.01404, "grad_norm": 0.0011081791017204523, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.0496, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.050034, "step": 14040, "value_mse_loss_layer_000": 0.000437, "value_mse_loss_layer_001": 0.001251, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.003067 }, { "ce_loss": 2.334302, "epoch": 0.01405, "grad_norm": 0.0012046457268297672, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.050171, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050595, "step": 14050, "value_mse_loss_layer_000": 0.000448, "value_mse_loss_layer_001": 0.001282, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.029541, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.056885, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.050537, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000142, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000313, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000224, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000607, "vq_loss_layer_028": 0.000679, "vq_loss_layer_029": 0.001152, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.311539, "epoch": 0.01406, "grad_norm": 0.0010604806011542678, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050073, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.05054, "step": 14060, "value_mse_loss_layer_000": 0.000443, "value_mse_loss_layer_001": 0.001251, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000129, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.00025, "vq_loss_layer_019": 0.00019, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.00037, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.001709, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.314469, "epoch": 0.01407, "grad_norm": 0.001126044662669301, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.049838, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050272, "step": 14070, "value_mse_loss_layer_000": 0.000437, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000429, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.001419, "vq_loss_layer_031": 0.002991 }, { "ce_loss": 2.343085, "epoch": 0.01408, "grad_norm": 0.001097455620765686, "key_mse_loss_layer_000": 0.002762, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.050259, "kv_vq_loss": 0.000407, "learning_rate": 0.001, "loss": 0.050705, "step": 14080, "value_mse_loss_layer_000": 0.000441, "value_mse_loss_layer_001": 0.001251, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.053711, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000526, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000376, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000383, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000359, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.345331, "epoch": 0.01409, "grad_norm": 0.001212304923683405, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.050009, "kv_vq_loss": 0.000416, "learning_rate": 0.001, "loss": 0.050458, "step": 14090, "value_mse_loss_layer_000": 0.000441, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.009155, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011963, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000209, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000431, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.36298, "epoch": 0.0141, "grad_norm": 0.0011705561773851514, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.053711, "key_mse_loss_layer_004": 0.059814, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049759, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050198, "step": 14100, "value_mse_loss_layer_000": 0.000444, "value_mse_loss_layer_001": 0.001251, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000137, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.00014, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000194, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.00025, "vq_loss_layer_026": 0.000399, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.00061, "vq_loss_layer_029": 0.000736, "vq_loss_layer_030": 0.001755, "vq_loss_layer_031": 0.002502 }, { "ce_loss": 2.30827, "epoch": 0.01411, "grad_norm": 0.0012544760247692466, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050348, "kv_vq_loss": 0.000425, "learning_rate": 0.001, "loss": 0.050809, "step": 14110, "value_mse_loss_layer_000": 0.000454, "value_mse_loss_layer_001": 0.001282, "value_mse_loss_layer_002": 0.004639, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.00824, "value_mse_loss_layer_005": 0.007751, "value_mse_loss_layer_006": 0.009277, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.052979, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.7e-05, "vq_loss_layer_006": 0.000128, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000357, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.000232, "vq_loss_layer_021": 0.000422, "vq_loss_layer_022": 0.000336, "vq_loss_layer_023": 0.000326, "vq_loss_layer_024": 0.000301, "vq_loss_layer_025": 0.000422, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000675, "vq_loss_layer_028": 0.000965, "vq_loss_layer_029": 0.00132, "vq_loss_layer_030": 0.002548, "vq_loss_layer_031": 0.004028 }, { "ce_loss": 2.289435, "epoch": 0.01412, "grad_norm": 0.001041410374455154, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050003, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050415, "step": 14120, "value_mse_loss_layer_000": 0.000446, "value_mse_loss_layer_001": 0.001274, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000162, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000205, "vq_loss_layer_023": 0.00022, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000395, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000797, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.002701 }, { "ce_loss": 2.31655, "epoch": 0.01413, "grad_norm": 0.0010824573691934347, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049786, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050186, "step": 14130, "value_mse_loss_layer_000": 0.000433, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013733, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000429, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.28534, "epoch": 0.01414, "grad_norm": 0.0013368106447160244, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.050516, "kv_vq_loss": 0.000423, "learning_rate": 0.001, "loss": 0.05098, "step": 14140, "value_mse_loss_layer_000": 0.000446, "value_mse_loss_layer_001": 0.001244, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.028076, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.00012, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000366, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000448, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.00041, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.000412, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000336, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000395, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.001114, "vq_loss_layer_029": 0.000751, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.003067 }, { "ce_loss": 2.291703, "epoch": 0.01415, "grad_norm": 0.0011305701918900013, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050079, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.050525, "step": 14150, "value_mse_loss_layer_000": 0.000435, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.007324, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011902, "value_mse_loss_layer_009": 0.016602, "value_mse_loss_layer_010": 0.013367, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000322, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000188, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000294, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.000977, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.003098 }, { "ce_loss": 2.290464, "epoch": 0.01416, "grad_norm": 0.0011600019643083215, "key_mse_loss_layer_000": 0.003479, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.05018, "kv_vq_loss": 0.000417, "learning_rate": 0.001, "loss": 0.050613, "step": 14160, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001266, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007629, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000288, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000209, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000227, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.261685, "epoch": 0.01417, "grad_norm": 0.0011501051485538483, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050531, "kv_vq_loss": 0.000407, "learning_rate": 0.001, "loss": 0.050989, "step": 14170, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.001213, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.012024, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015564, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000224, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.339057, "epoch": 0.01418, "grad_norm": 0.0011609046487137675, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.050333, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.050732, "step": 14180, "value_mse_loss_layer_000": 0.000446, "value_mse_loss_layer_001": 0.001274, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.050537, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000193, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000254, "vq_loss_layer_018": 0.00017, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000156, "vq_loss_layer_021": 0.000278, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000603, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.002686 }, { "ce_loss": 2.285001, "epoch": 0.01419, "grad_norm": 0.0013719385024160147, "key_mse_loss_layer_000": 0.003738, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.043945, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050284, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.05072, "step": 14190, "value_mse_loss_layer_000": 0.000446, "value_mse_loss_layer_001": 0.001274, "value_mse_loss_layer_002": 0.004547, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.01709, "value_mse_loss_layer_018": 0.015259, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019165, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.063965, "value_mse_loss_layer_031": 0.053467, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.000194, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000212, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000473, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000259, "vq_loss_layer_018": 0.000167, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.00022, "vq_loss_layer_024": 0.00032, "vq_loss_layer_025": 0.000402, "vq_loss_layer_026": 0.000641, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001236, "vq_loss_layer_030": 0.003174, "vq_loss_layer_031": 0.004456 }, { "ce_loss": 2.289254, "epoch": 0.0142, "grad_norm": 0.001080309972167015, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049954, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050406, "step": 14200, "value_mse_loss_layer_000": 0.000454, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000139, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000196, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000414, "vq_loss_layer_028": 0.000801, "vq_loss_layer_029": 0.000755, "vq_loss_layer_030": 0.00164, "vq_loss_layer_031": 0.002472 }, { "ce_loss": 2.295452, "epoch": 0.01421, "grad_norm": 0.0011635697446763515, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050275, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050656, "step": 14210, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000374, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000748, "vq_loss_layer_030": 0.001846, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.252003, "epoch": 0.01422, "grad_norm": 0.001116838539019227, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.095215, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.104492, "key_mse_loss_layer_012": 0.078613, "key_mse_loss_layer_013": 0.133789, "key_mse_loss_layer_014": 0.130859, "key_mse_loss_layer_015": 0.115234, "key_mse_loss_layer_016": 0.108887, "key_mse_loss_layer_017": 0.110352, "key_mse_loss_layer_018": 0.118164, "key_mse_loss_layer_019": 0.094238, "key_mse_loss_layer_020": 0.106934, "key_mse_loss_layer_021": 0.100098, "key_mse_loss_layer_022": 0.104492, "key_mse_loss_layer_023": 0.103027, "key_mse_loss_layer_024": 0.081055, "key_mse_loss_layer_025": 0.077148, "key_mse_loss_layer_026": 0.089844, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050064, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050482, "step": 14220, "value_mse_loss_layer_000": 0.000441, "value_mse_loss_layer_001": 0.001244, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000341, "vq_loss_layer_014": 0.000402, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000218, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.000324, "vq_loss_layer_025": 0.00036, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.003174 }, { "ce_loss": 2.337961, "epoch": 0.01423, "grad_norm": 0.001151518546976149, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.053955, "key_mse_loss_layer_004": 0.060547, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.050037, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050476, "step": 14230, "value_mse_loss_layer_000": 0.000441, "value_mse_loss_layer_001": 0.001251, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000136, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000212, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000166, "vq_loss_layer_021": 0.000263, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000207, "vq_loss_layer_024": 0.000208, "vq_loss_layer_025": 0.000229, "vq_loss_layer_026": 0.000359, "vq_loss_layer_027": 0.000406, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000706, "vq_loss_layer_030": 0.001511, "vq_loss_layer_031": 0.002609 }, { "ce_loss": 2.346051, "epoch": 0.01424, "grad_norm": 0.0010674860095605254, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.076172, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.04996, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050369, "step": 14240, "value_mse_loss_layer_000": 0.000443, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.312049, "epoch": 0.01425, "grad_norm": 0.0012691926676779985, "key_mse_loss_layer_000": 0.003662, "key_mse_loss_layer_001": 0.010925, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.049933, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050375, "step": 14250, "value_mse_loss_layer_000": 0.00045, "value_mse_loss_layer_001": 0.001266, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007141, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.013916, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.025391, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 7.8e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000184, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.000229, "vq_loss_layer_011": 0.000235, "vq_loss_layer_012": 0.00036, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000538, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000334, "vq_loss_layer_026": 0.000664, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.000893, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.002609, "vq_loss_layer_031": 0.003098 }, { "ce_loss": 2.297818, "epoch": 0.01426, "grad_norm": 0.0011199916480109096, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.050189, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050647, "step": 14260, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000184, "vq_loss_layer_010": 0.000147, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000242, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.00033, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000261, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000155, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000205, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000221, "vq_loss_layer_025": 0.00025, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.00061, "vq_loss_layer_029": 0.000916, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.317516, "epoch": 0.01427, "grad_norm": 0.0010505743557587266, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050101, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050531, "step": 14270, "value_mse_loss_layer_000": 0.000448, "value_mse_loss_layer_001": 0.001244, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.00018, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000168, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.00016, "vq_loss_layer_019": 0.000128, "vq_loss_layer_020": 0.000169, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.000193, "vq_loss_layer_023": 0.00021, "vq_loss_layer_024": 0.000197, "vq_loss_layer_025": 0.000248, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000435, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000767, "vq_loss_layer_030": 0.001625, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.295039, "epoch": 0.01428, "grad_norm": 0.0011242792243137956, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050323, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050748, "step": 14280, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001251, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000169, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000241, "vq_loss_layer_023": 0.000209, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000224, "vq_loss_layer_026": 0.000362, "vq_loss_layer_027": 0.000385, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.000748, "vq_loss_layer_030": 0.00161, "vq_loss_layer_031": 0.002625 }, { "ce_loss": 2.362862, "epoch": 0.01429, "grad_norm": 0.0010863314382731915, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049667, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050104, "step": 14290, "value_mse_loss_layer_000": 0.000444, "value_mse_loss_layer_001": 0.001244, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000435, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.001717, "vq_loss_layer_031": 0.003067 }, { "ce_loss": 2.298551, "epoch": 0.0143, "grad_norm": 0.0012680793879553676, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.090332, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.050253, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050671, "step": 14300, "value_mse_loss_layer_000": 0.000443, "value_mse_loss_layer_001": 0.001274, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011841, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.019775, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.025146, "value_mse_loss_layer_022": 0.025757, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000216, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.000431, "vq_loss_layer_015": 0.000469, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000345, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000232, "vq_loss_layer_021": 0.00041, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000473, "vq_loss_layer_027": 0.000599, "vq_loss_layer_028": 0.00103, "vq_loss_layer_029": 0.001534, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.004059 }, { "ce_loss": 2.264917, "epoch": 0.01431, "grad_norm": 0.0012968879891559482, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050558, "kv_vq_loss": 0.000423, "learning_rate": 0.001, "loss": 0.051016, "step": 14310, "value_mse_loss_layer_000": 0.000435, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.054199, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.00036, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000467, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.003357 }, { "ce_loss": 2.303968, "epoch": 0.01432, "grad_norm": 0.0013311783550307155, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.050421, "kv_vq_loss": 0.000419, "learning_rate": 0.001, "loss": 0.050879, "step": 14320, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000427, "vq_loss_layer_016": 0.000378, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000263, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000345, "vq_loss_layer_022": 0.00028, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000816, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003326 }, { "ce_loss": 2.331694, "epoch": 0.01433, "grad_norm": 0.0010229547042399645, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049838, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050266, "step": 14330, "value_mse_loss_layer_000": 0.000441, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007782, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.034668, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.049561, "value_mse_loss_layer_029": 0.05835, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.052002, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000288, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000189, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000839, "vq_loss_layer_029": 0.001144, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.265342, "epoch": 0.01434, "grad_norm": 0.0013505714014172554, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.110352, "key_mse_loss_layer_016": 0.10498, "key_mse_loss_layer_017": 0.10498, "key_mse_loss_layer_018": 0.112305, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.100586, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050592, "kv_vq_loss": 0.000414, "learning_rate": 0.001, "loss": 0.051035, "step": 14340, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.022339, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.029297, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.049805, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000326, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.000843, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.002991 }, { "ce_loss": 2.339113, "epoch": 0.01435, "grad_norm": 0.001225872547365725, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.060303, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.089355, "key_mse_loss_layer_027": 0.091797, "key_mse_loss_layer_028": 0.097656, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050143, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050595, "step": 14350, "value_mse_loss_layer_000": 0.000441, "value_mse_loss_layer_001": 0.001244, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.054443, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000138, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000212, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000298, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000309, "vq_loss_layer_025": 0.000465, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.001213, "vq_loss_layer_029": 0.001175, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.004883 }, { "ce_loss": 2.295781, "epoch": 0.01436, "grad_norm": 0.0011659790761768818, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.104004, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.05014, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050555, "step": 14360, "value_mse_loss_layer_000": 0.000437, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000178, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000446, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000219, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.000236, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.270607, "epoch": 0.01437, "grad_norm": 0.0014457367360591888, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.011047, "key_mse_loss_layer_002": 0.062256, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.100586, "key_mse_loss_layer_023": 0.101074, "key_mse_loss_layer_024": 0.081543, "key_mse_loss_layer_025": 0.080078, "key_mse_loss_layer_026": 0.091797, "key_mse_loss_layer_027": 0.09668, "key_mse_loss_layer_028": 0.101074, "key_mse_loss_layer_029": 0.095215, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.079102, "kv_mse_loss": 0.050385, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050806, "step": 14370, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001266, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007629, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013306, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.039307, "value_mse_loss_layer_026": 0.036377, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.053711, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.000135, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000205, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000278, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.00033, "vq_loss_layer_016": 0.00029, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000248, "vq_loss_layer_019": 0.000185, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.000345, "vq_loss_layer_025": 0.000448, "vq_loss_layer_026": 0.000568, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.001366, "vq_loss_layer_029": 0.001434, "vq_loss_layer_030": 0.002808, "vq_loss_layer_031": 0.005005 }, { "ce_loss": 2.283901, "epoch": 0.01438, "grad_norm": 0.001038793008774519, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.041992, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.071289, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.085449, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.074219, "key_mse_loss_layer_031": 0.059814, "kv_mse_loss": 0.050262, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.050699, "step": 14380, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000374, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.001808, "vq_loss_layer_031": 0.003616 }, { "ce_loss": 2.269098, "epoch": 0.01439, "grad_norm": 0.0011586660984903574, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.050247, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050677, "step": 14390, "value_mse_loss_layer_000": 0.000443, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000179, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000307, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000278, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.000208, "vq_loss_layer_024": 0.000173, "vq_loss_layer_025": 0.000232, "vq_loss_layer_026": 0.000376, "vq_loss_layer_027": 0.000429, "vq_loss_layer_028": 0.000546, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.002548 }, { "ce_loss": 2.284801, "epoch": 0.0144, "grad_norm": 0.001127814408391714, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050546, "kv_vq_loss": 0.000417, "learning_rate": 0.001, "loss": 0.051004, "step": 14400, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.00017, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000261, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000236, "vq_loss_layer_024": 0.000205, "vq_loss_layer_025": 0.00034, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000422, "vq_loss_layer_028": 0.00058, "vq_loss_layer_029": 0.000797, "vq_loss_layer_030": 0.001869, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.319113, "epoch": 0.01441, "grad_norm": 0.0011378886410966516, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.050235, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050665, "step": 14410, "value_mse_loss_layer_000": 0.000448, "value_mse_loss_layer_001": 0.001244, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000241, "vq_loss_layer_021": 0.000364, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000315, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.000374, "vq_loss_layer_026": 0.000469, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.000809, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.325707, "epoch": 0.01442, "grad_norm": 0.0012157775927335024, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.103027, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.050217, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050638, "step": 14420, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.001244, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.013062, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000961, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.00267 }, { "ce_loss": 2.310709, "epoch": 0.01443, "grad_norm": 0.001238141325302422, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050327, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050772, "step": 14430, "value_mse_loss_layer_000": 0.000444, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000229, "vq_loss_layer_024": 0.000208, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.000816, "vq_loss_layer_029": 0.000755, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.324783, "epoch": 0.01444, "grad_norm": 0.0011227803770452738, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.060059, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.050244, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050659, "step": 14440, "value_mse_loss_layer_000": 0.000448, "value_mse_loss_layer_001": 0.001251, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000164, "vq_loss_layer_019": 0.000187, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.000275, "vq_loss_layer_026": 0.000385, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.000622, "vq_loss_layer_029": 0.000748, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.37419, "epoch": 0.01445, "grad_norm": 0.0013793130638077855, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.043457, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.09375, "key_mse_loss_layer_009": 0.099121, "key_mse_loss_layer_010": 0.112793, "key_mse_loss_layer_011": 0.106934, "key_mse_loss_layer_012": 0.083008, "key_mse_loss_layer_013": 0.150391, "key_mse_loss_layer_014": 0.145508, "key_mse_loss_layer_015": 0.130859, "key_mse_loss_layer_016": 0.126953, "key_mse_loss_layer_017": 0.125977, "key_mse_loss_layer_018": 0.132812, "key_mse_loss_layer_019": 0.102051, "key_mse_loss_layer_020": 0.119141, "key_mse_loss_layer_021": 0.112305, "key_mse_loss_layer_022": 0.119629, "key_mse_loss_layer_023": 0.116699, "key_mse_loss_layer_024": 0.091797, "key_mse_loss_layer_025": 0.084473, "key_mse_loss_layer_026": 0.101074, "key_mse_loss_layer_027": 0.097168, "key_mse_loss_layer_028": 0.105469, "key_mse_loss_layer_029": 0.091797, "key_mse_loss_layer_030": 0.10498, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050101, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050534, "step": 14450, "value_mse_loss_layer_000": 0.000437, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.023926, "value_mse_loss_layer_024": 0.02771, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.028564, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.04834, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000207, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.000236, "vq_loss_layer_011": 0.000252, "vq_loss_layer_012": 0.000378, "vq_loss_layer_013": 0.000353, "vq_loss_layer_014": 0.000481, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.00036, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000181, "vq_loss_layer_020": 0.000269, "vq_loss_layer_021": 0.000404, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.000347, "vq_loss_layer_024": 0.000443, "vq_loss_layer_025": 0.000572, "vq_loss_layer_026": 0.000584, "vq_loss_layer_027": 0.000839, "vq_loss_layer_028": 0.001175, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.00264, "vq_loss_layer_031": 0.00383 }, { "ce_loss": 2.30597, "epoch": 0.01446, "grad_norm": 0.0010761350858956575, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.044922, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.109863, "key_mse_loss_layer_016": 0.104492, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.108398, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.061035, "kv_mse_loss": 0.050031, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050424, "step": 14460, "value_mse_loss_layer_000": 0.000433, "value_mse_loss_layer_001": 0.001221, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.008301, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.013855, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.021118, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.023315, "value_mse_loss_layer_024": 0.026123, "value_mse_loss_layer_025": 0.032227, "value_mse_loss_layer_026": 0.027832, "value_mse_loss_layer_027": 0.035645, "value_mse_loss_layer_028": 0.040039, "value_mse_loss_layer_029": 0.044922, "value_mse_loss_layer_030": 0.051514, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 4.1e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000191, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000213, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000404, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000223, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000328, "vq_loss_layer_023": 0.000322, "vq_loss_layer_024": 0.000355, "vq_loss_layer_025": 0.000431, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.004028 }, { "ce_loss": 2.285967, "epoch": 0.01447, "grad_norm": 0.0013980549992993474, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.050165, "kv_vq_loss": 0.000418, "learning_rate": 0.001, "loss": 0.050626, "step": 14470, "value_mse_loss_layer_000": 0.000435, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007782, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.05249, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000816, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.003586 }, { "ce_loss": 2.327198, "epoch": 0.01448, "grad_norm": 0.001052055275067687, "key_mse_loss_layer_000": 0.002777, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.123535, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.109863, "key_mse_loss_layer_016": 0.102539, "key_mse_loss_layer_017": 0.104004, "key_mse_loss_layer_018": 0.110352, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.104004, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.100098, "key_mse_loss_layer_023": 0.097168, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050421, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050845, "step": 14480, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001205, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000401, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.000797, "vq_loss_layer_030": 0.001671, "vq_loss_layer_031": 0.002884 }, { "ce_loss": 2.283856, "epoch": 0.01449, "grad_norm": 0.0010749328648671508, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049768, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050186, "step": 14490, "value_mse_loss_layer_000": 0.000433, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000181, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000433, "vq_loss_layer_028": 0.000633, "vq_loss_layer_029": 0.000713, "vq_loss_layer_030": 0.001534, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.319384, "epoch": 0.0145, "grad_norm": 0.001092612510547042, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.123047, "key_mse_loss_layer_015": 0.111328, "key_mse_loss_layer_016": 0.105469, "key_mse_loss_layer_017": 0.105469, "key_mse_loss_layer_018": 0.111328, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.099609, "key_mse_loss_layer_022": 0.103027, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.049884, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050311, "step": 14500, "value_mse_loss_layer_000": 0.000446, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015442, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.013855, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025146, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.029175, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000178, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.000265, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.000141, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000209, "vq_loss_layer_023": 0.00022, "vq_loss_layer_024": 0.000224, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000385, "vq_loss_layer_027": 0.000402, "vq_loss_layer_028": 0.00071, "vq_loss_layer_029": 0.00069, "vq_loss_layer_030": 0.001755, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.290064, "epoch": 0.01451, "grad_norm": 0.0012599483598023653, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050397, "kv_vq_loss": 0.000416, "learning_rate": 0.001, "loss": 0.050842, "step": 14510, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001205, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.00021, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.00074, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.002991 }, { "ce_loss": 2.333369, "epoch": 0.01452, "grad_norm": 0.0012131895637139678, "key_mse_loss_layer_000": 0.003632, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.100098, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.104004, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.100586, "key_mse_loss_layer_023": 0.099121, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.088867, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.089355, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.05, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050436, "step": 14520, "value_mse_loss_layer_000": 0.000443, "value_mse_loss_layer_001": 0.001244, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.00079, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.295988, "epoch": 0.01453, "grad_norm": 0.0012228748528286815, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050604, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.051025, "step": 14530, "value_mse_loss_layer_000": 0.000435, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.000237, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000303, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.002289, "vq_loss_layer_031": 0.003128 }, { "ce_loss": 2.288485, "epoch": 0.01454, "grad_norm": 0.0013300830032676458, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.104492, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.113281, "key_mse_loss_layer_019": 0.094727, "key_mse_loss_layer_020": 0.106934, "key_mse_loss_layer_021": 0.099609, "key_mse_loss_layer_022": 0.103516, "key_mse_loss_layer_023": 0.101562, "key_mse_loss_layer_024": 0.081055, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.089844, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.096191, "key_mse_loss_layer_029": 0.088867, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.050488, "kv_vq_loss": 0.000429, "learning_rate": 0.001, "loss": 0.050955, "step": 14540, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.001221, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.014038, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000238, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000332, "vq_loss_layer_016": 0.000288, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000334, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.000923, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.311963, "epoch": 0.01455, "grad_norm": 0.0010983060346916318, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.060059, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.050696, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.051126, "step": 14550, "value_mse_loss_layer_000": 0.000433, "value_mse_loss_layer_001": 0.001221, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.017822, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000214, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000162, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.000202, "vq_loss_layer_024": 0.000221, "vq_loss_layer_025": 0.000223, "vq_loss_layer_026": 0.000368, "vq_loss_layer_027": 0.000444, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.002319 }, { "ce_loss": 2.301048, "epoch": 0.01456, "grad_norm": 0.0011445395648479462, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.050354, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050778, "step": 14560, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000334, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.002548, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.32929, "epoch": 0.01457, "grad_norm": 0.0011572933290153742, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.125977, "key_mse_loss_layer_015": 0.114746, "key_mse_loss_layer_016": 0.10791, "key_mse_loss_layer_017": 0.10791, "key_mse_loss_layer_018": 0.115723, "key_mse_loss_layer_019": 0.096191, "key_mse_loss_layer_020": 0.10791, "key_mse_loss_layer_021": 0.102051, "key_mse_loss_layer_022": 0.106934, "key_mse_loss_layer_023": 0.105957, "key_mse_loss_layer_024": 0.085449, "key_mse_loss_layer_025": 0.082031, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.094238, "key_mse_loss_layer_028": 0.100098, "key_mse_loss_layer_029": 0.092285, "key_mse_loss_layer_030": 0.097168, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.050235, "kv_vq_loss": 0.000433, "learning_rate": 0.001, "loss": 0.050705, "step": 14570, "value_mse_loss_layer_000": 0.000437, "value_mse_loss_layer_001": 0.001244, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000182, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000843, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.003174 }, { "ce_loss": 2.358852, "epoch": 0.01458, "grad_norm": 0.0010596828069537878, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050052, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050473, "step": 14580, "value_mse_loss_layer_000": 0.000433, "value_mse_loss_layer_001": 0.001205, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000338, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.000343, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.000437, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000675, "vq_loss_layer_030": 0.001534, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.300124, "epoch": 0.01459, "grad_norm": 0.0012307828292250633, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.060547, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.050345, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050769, "step": 14590, "value_mse_loss_layer_000": 0.000443, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.024414, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000131, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000166, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.000185, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.000603, "vq_loss_layer_029": 0.000797, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.002716 }, { "ce_loss": 2.302431, "epoch": 0.0146, "grad_norm": 0.0012199438642710447, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.049673, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050079, "step": 14600, "value_mse_loss_layer_000": 0.000414, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000138, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000397, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000219, "vq_loss_layer_020": 0.000435, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000219, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000668, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.326248, "epoch": 0.01461, "grad_norm": 0.0012035358231514692, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.104004, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.111328, "key_mse_loss_layer_019": 0.094727, "key_mse_loss_layer_020": 0.10498, "key_mse_loss_layer_021": 0.099609, "key_mse_loss_layer_022": 0.104492, "key_mse_loss_layer_023": 0.102539, "key_mse_loss_layer_024": 0.083008, "key_mse_loss_layer_025": 0.079102, "key_mse_loss_layer_026": 0.091797, "key_mse_loss_layer_027": 0.092773, "key_mse_loss_layer_028": 0.098633, "key_mse_loss_layer_029": 0.092285, "key_mse_loss_layer_030": 0.098145, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.049982, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050406, "step": 14610, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013306, "value_mse_loss_layer_013": 0.014587, "value_mse_loss_layer_014": 0.01532, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.013733, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.00015, "vq_loss_layer_011": 0.000162, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000235, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000305, "vq_loss_layer_016": 0.000277, "vq_loss_layer_017": 0.000229, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000136, "vq_loss_layer_020": 0.000149, "vq_loss_layer_021": 0.000267, "vq_loss_layer_022": 0.000187, "vq_loss_layer_023": 0.000204, "vq_loss_layer_024": 0.000206, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000912, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.29762, "epoch": 0.01462, "grad_norm": 0.0010777557035908103, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050259, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050665, "step": 14620, "value_mse_loss_layer_000": 0.000435, "value_mse_loss_layer_001": 0.001221, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000135, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000151, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000309, "vq_loss_layer_015": 0.000338, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000243, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000164, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000229, "vq_loss_layer_024": 0.000199, "vq_loss_layer_025": 0.000221, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000397, "vq_loss_layer_028": 0.000668, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.00177, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.333709, "epoch": 0.01463, "grad_norm": 0.0011526963207870722, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049966, "kv_vq_loss": 0.000407, "learning_rate": 0.001, "loss": 0.050403, "step": 14630, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000157, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000399, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.323038, "epoch": 0.01464, "grad_norm": 0.0011658803559839725, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.049765, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050192, "step": 14640, "value_mse_loss_layer_000": 0.000435, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000345, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000211, "vq_loss_layer_024": 0.000204, "vq_loss_layer_025": 0.000236, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.000433, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.001526, "vq_loss_layer_031": 0.002625 }, { "ce_loss": 2.298257, "epoch": 0.01465, "grad_norm": 0.0011177539126947522, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.041992, "key_mse_loss_layer_005": 0.05542, "key_mse_loss_layer_006": 0.062256, "key_mse_loss_layer_007": 0.070801, "key_mse_loss_layer_008": 0.079102, "key_mse_loss_layer_009": 0.083008, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.092773, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.088867, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.082031, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.084961, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.075684, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.049841, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050275, "step": 14650, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.001213, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007446, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.1e-05, "vq_loss_layer_007": 0.000138, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000173, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000961, "vq_loss_layer_029": 0.001228, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.004059 }, { "ce_loss": 2.285428, "epoch": 0.01466, "grad_norm": 0.0011011555325239897, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.083008, "kv_mse_loss": 0.049719, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050128, "step": 14660, "value_mse_loss_layer_000": 0.000444, "value_mse_loss_layer_001": 0.001251, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.028442, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000298, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000317, "vq_loss_layer_015": 0.000328, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000275, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.00106, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.003525 }, { "ce_loss": 2.27843, "epoch": 0.01467, "grad_norm": 0.0011583165032789111, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.061035, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.049976, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050427, "step": 14670, "value_mse_loss_layer_000": 0.000416, "value_mse_loss_layer_001": 0.001198, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000265, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.002548 }, { "ce_loss": 2.326198, "epoch": 0.01468, "grad_norm": 0.0010324111208319664, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.043457, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.050021, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050464, "step": 14680, "value_mse_loss_layer_000": 0.000443, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.017578, "value_mse_loss_layer_020": 0.018921, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.024414, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.00025, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000399, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.000229, "vq_loss_layer_024": 0.000336, "vq_loss_layer_025": 0.000376, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.000671, "vq_loss_layer_028": 0.000938, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002289, "vq_loss_layer_031": 0.003586 }, { "ce_loss": 2.291221, "epoch": 0.01469, "grad_norm": 0.0011768057011067867, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049878, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050302, "step": 14690, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.001205, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000141, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.000236, "vq_loss_layer_024": 0.000202, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.00041, "vq_loss_layer_028": 0.000633, "vq_loss_layer_029": 0.000683, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.002701 }, { "ce_loss": 2.31012, "epoch": 0.0147, "grad_norm": 0.0010555466869845986, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.049963, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050397, "step": 14700, "value_mse_loss_layer_000": 0.000422, "value_mse_loss_layer_001": 0.001213, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.00022, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.00019, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.00082, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.003326 }, { "ce_loss": 2.306808, "epoch": 0.01471, "grad_norm": 0.0010872791754081845, "key_mse_loss_layer_000": 0.003769, "key_mse_loss_layer_001": 0.01123, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.102539, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050357, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.05079, "step": 14710, "value_mse_loss_layer_000": 0.000441, "value_mse_loss_layer_001": 0.001259, "value_mse_loss_layer_002": 0.004578, "value_mse_loss_layer_003": 0.008484, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000189, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000383, "vq_loss_layer_017": 0.000374, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000254, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.00028, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000303, "vq_loss_layer_025": 0.000399, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.000874, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.003571 }, { "ce_loss": 2.306977, "epoch": 0.01472, "grad_norm": 0.0011512584751471877, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049954, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050409, "step": 14720, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.001213, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000211, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000446, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000927, "vq_loss_layer_029": 0.000977, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.003586 }, { "ce_loss": 2.282318, "epoch": 0.01473, "grad_norm": 0.0011524776928126812, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050217, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050635, "step": 14730, "value_mse_loss_layer_000": 0.000444, "value_mse_loss_layer_001": 0.001221, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000338, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000763, "vq_loss_layer_030": 0.001656, "vq_loss_layer_031": 0.002808 }, { "ce_loss": 2.319104, "epoch": 0.01474, "grad_norm": 0.001212882692925632, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.050272, "kv_vq_loss": 0.000416, "learning_rate": 0.001, "loss": 0.050708, "step": 14740, "value_mse_loss_layer_000": 0.000433, "value_mse_loss_layer_001": 0.001221, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.00018, "vq_loss_layer_010": 0.000153, "vq_loss_layer_011": 0.000176, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000166, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000167, "vq_loss_layer_021": 0.000282, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000228, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.00024, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000444, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000721, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.311844, "epoch": 0.01475, "grad_norm": 0.0013044262304902077, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050143, "kv_vq_loss": 0.000417, "learning_rate": 0.001, "loss": 0.050601, "step": 14750, "value_mse_loss_layer_000": 0.000433, "value_mse_loss_layer_001": 0.001213, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.056885, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000205, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000238, "vq_loss_layer_014": 0.000311, "vq_loss_layer_015": 0.000332, "vq_loss_layer_016": 0.000273, "vq_loss_layer_017": 0.000267, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000145, "vq_loss_layer_021": 0.000261, "vq_loss_layer_022": 0.000183, "vq_loss_layer_023": 0.000225, "vq_loss_layer_024": 0.000214, "vq_loss_layer_025": 0.000235, "vq_loss_layer_026": 0.000366, "vq_loss_layer_027": 0.000391, "vq_loss_layer_028": 0.000725, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.001534, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.29784, "epoch": 0.01476, "grad_norm": 0.001223945408128202, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049881, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050305, "step": 14760, "value_mse_loss_layer_000": 0.000435, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015625, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.02002, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.024658, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000168, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000198, "vq_loss_layer_023": 0.000236, "vq_loss_layer_024": 0.000227, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000668, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.315621, "epoch": 0.01477, "grad_norm": 0.0012482820311561227, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.050198, "kv_vq_loss": 0.000424, "learning_rate": 0.001, "loss": 0.050656, "step": 14770, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001205, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000317, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000207, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000429, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.368332, "epoch": 0.01478, "grad_norm": 0.0009265710832551122, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.049796, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050204, "step": 14780, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001213, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014526, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000175, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000172, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000317, "vq_loss_layer_015": 0.000315, "vq_loss_layer_016": 0.000284, "vq_loss_layer_017": 0.000259, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000257, "vq_loss_layer_022": 0.000195, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000194, "vq_loss_layer_025": 0.00025, "vq_loss_layer_026": 0.000341, "vq_loss_layer_027": 0.000401, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.000763, "vq_loss_layer_030": 0.001419, "vq_loss_layer_031": 0.002441 }, { "ce_loss": 2.343112, "epoch": 0.01479, "grad_norm": 0.001298537477850914, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.123535, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.103027, "key_mse_loss_layer_017": 0.104004, "key_mse_loss_layer_018": 0.111328, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.103516, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.100586, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.087891, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050214, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.05062, "step": 14790, "value_mse_loss_layer_000": 0.000437, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009705, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000171, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000248, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000359, "vq_loss_layer_026": 0.00058, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.000946, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.003113 }, { "ce_loss": 2.272734, "epoch": 0.0148, "grad_norm": 0.0011403486132621765, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.124023, "key_mse_loss_layer_015": 0.111328, "key_mse_loss_layer_016": 0.103027, "key_mse_loss_layer_017": 0.103027, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.050021, "kv_vq_loss": 0.000415, "learning_rate": 0.001, "loss": 0.050476, "step": 14800, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001198, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.022339, "value_mse_loss_layer_023": 0.025146, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029419, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000189, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000248, "vq_loss_layer_021": 0.000383, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000355, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000843, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.003174 }, { "ce_loss": 2.272155, "epoch": 0.01481, "grad_norm": 0.0009986519580706954, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049896, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050323, "step": 14810, "value_mse_loss_layer_000": 0.000425, "value_mse_loss_layer_001": 0.001213, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.1e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000139, "vq_loss_layer_008": 0.000134, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000237, "vq_loss_layer_026": 0.000359, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.322504, "epoch": 0.01482, "grad_norm": 0.0010989836882799864, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049908, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050336, "step": 14820, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.00119, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.025391, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.029053, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.053711, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000893, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.002808 }, { "ce_loss": 2.329767, "epoch": 0.01483, "grad_norm": 0.001057703047990799, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050183, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050604, "step": 14830, "value_mse_loss_layer_000": 0.000437, "value_mse_loss_layer_001": 0.001221, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000164, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000202, "vq_loss_layer_023": 0.000224, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.00025, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.001678, "vq_loss_layer_031": 0.002609 }, { "ce_loss": 2.318146, "epoch": 0.01484, "grad_norm": 0.0011036844225600362, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.049713, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050137, "step": 14840, "value_mse_loss_layer_000": 0.000431, "value_mse_loss_layer_001": 0.001213, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000142, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000241, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000169, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000208, "vq_loss_layer_025": 0.00023, "vq_loss_layer_026": 0.000391, "vq_loss_layer_027": 0.000469, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.001648, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.252646, "epoch": 0.01485, "grad_norm": 0.0011548924958333373, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.052246, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050522, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050961, "step": 14850, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.021851, "value_mse_loss_layer_023": 0.024536, "value_mse_loss_layer_024": 0.027466, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.029175, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.048096, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.00016, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000238, "vq_loss_layer_025": 0.000359, "vq_loss_layer_026": 0.000444, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.000671, "vq_loss_layer_030": 0.001678, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.28924, "epoch": 0.01486, "grad_norm": 0.001180553692393005, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050012, "kv_vq_loss": 0.000421, "learning_rate": 0.001, "loss": 0.050488, "step": 14860, "value_mse_loss_layer_000": 0.000425, "value_mse_loss_layer_001": 0.001205, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000183, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000239, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000239, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.00071, "vq_loss_layer_029": 0.000935, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003174 }, { "ce_loss": 2.269977, "epoch": 0.01487, "grad_norm": 0.0010897687170654535, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05249, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.049985, "kv_vq_loss": 0.000407, "learning_rate": 0.001, "loss": 0.050424, "step": 14870, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001213, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.00079, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.00145, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.287983, "epoch": 0.01488, "grad_norm": 0.0010862749768421054, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.059326, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.080078, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.129883, "key_mse_loss_layer_014": 0.123535, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.105957, "key_mse_loss_layer_017": 0.10791, "key_mse_loss_layer_018": 0.117188, "key_mse_loss_layer_019": 0.098145, "key_mse_loss_layer_020": 0.109375, "key_mse_loss_layer_021": 0.102051, "key_mse_loss_layer_022": 0.104004, "key_mse_loss_layer_023": 0.104492, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.078125, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.094238, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.050528, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050961, "step": 14880, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.008362, "value_mse_loss_layer_004": 0.008057, "value_mse_loss_layer_005": 0.007416, "value_mse_loss_layer_006": 0.009521, "value_mse_loss_layer_007": 0.009766, "value_mse_loss_layer_008": 0.01178, "value_mse_loss_layer_009": 0.016357, "value_mse_loss_layer_010": 0.013489, "value_mse_loss_layer_011": 0.014099, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.020264, "value_mse_loss_layer_020": 0.022339, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 7.4e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000226, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.000381, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000242, "vq_loss_layer_019": 0.000219, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000378, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000301, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000496, "vq_loss_layer_026": 0.000595, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000912, "vq_loss_layer_029": 0.001144, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.003693 }, { "ce_loss": 2.266616, "epoch": 0.01489, "grad_norm": 0.001342215808108449, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.05014, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050574, "step": 14890, "value_mse_loss_layer_000": 0.000437, "value_mse_loss_layer_001": 0.001221, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000134, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000151, "vq_loss_layer_011": 0.000169, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000167, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.00018, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000178, "vq_loss_layer_025": 0.000224, "vq_loss_layer_026": 0.000355, "vq_loss_layer_027": 0.000406, "vq_loss_layer_028": 0.000626, "vq_loss_layer_029": 0.000706, "vq_loss_layer_030": 0.001358, "vq_loss_layer_031": 0.002838 }, { "ce_loss": 2.270168, "epoch": 0.0149, "grad_norm": 0.0013698798138648272, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.050287, "kv_vq_loss": 0.000416, "learning_rate": 0.001, "loss": 0.050748, "step": 14900, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000244, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000389, "vq_loss_layer_018": 0.00017, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.000192, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000383, "vq_loss_layer_027": 0.00037, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000732, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.002441 }, { "ce_loss": 2.343096, "epoch": 0.01491, "grad_norm": 0.0010530981235206127, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.091797, "key_mse_loss_layer_009": 0.094727, "key_mse_loss_layer_010": 0.108398, "key_mse_loss_layer_011": 0.105469, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.131836, "key_mse_loss_layer_014": 0.125977, "key_mse_loss_layer_015": 0.116211, "key_mse_loss_layer_016": 0.108398, "key_mse_loss_layer_017": 0.107422, "key_mse_loss_layer_018": 0.115234, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.100098, "key_mse_loss_layer_022": 0.104004, "key_mse_loss_layer_023": 0.101074, "key_mse_loss_layer_024": 0.081543, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.089355, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.049826, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.050253, "step": 14910, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007629, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.0177, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.021484, "value_mse_loss_layer_023": 0.023071, "value_mse_loss_layer_024": 0.027588, "value_mse_loss_layer_025": 0.032227, "value_mse_loss_layer_026": 0.028076, "value_mse_loss_layer_027": 0.036133, "value_mse_loss_layer_028": 0.040771, "value_mse_loss_layer_029": 0.046875, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.6e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000212, "vq_loss_layer_011": 0.000225, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000393, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000376, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.00033, "vq_loss_layer_025": 0.000463, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.001015, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.003754 }, { "ce_loss": 2.297738, "epoch": 0.01492, "grad_norm": 0.0010392230469733477, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.04964, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050067, "step": 14920, "value_mse_loss_layer_000": 0.00041, "value_mse_loss_layer_001": 0.001175, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000404, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.001656, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.325192, "epoch": 0.01493, "grad_norm": 0.001052118488587439, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.123047, "key_mse_loss_layer_015": 0.112793, "key_mse_loss_layer_016": 0.106445, "key_mse_loss_layer_017": 0.106934, "key_mse_loss_layer_018": 0.114746, "key_mse_loss_layer_019": 0.094727, "key_mse_loss_layer_020": 0.106934, "key_mse_loss_layer_021": 0.101074, "key_mse_loss_layer_022": 0.104004, "key_mse_loss_layer_023": 0.100098, "key_mse_loss_layer_024": 0.080078, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.087891, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050259, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050681, "step": 14930, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001221, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.025024, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000336, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000343, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000444, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000717, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.292657, "epoch": 0.01494, "grad_norm": 0.0010991041781380773, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049997, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050443, "step": 14940, "value_mse_loss_layer_000": 0.000425, "value_mse_loss_layer_001": 0.001198, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000126, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000148, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000311, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000282, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.00016, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000166, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000193, "vq_loss_layer_023": 0.000221, "vq_loss_layer_024": 0.000173, "vq_loss_layer_025": 0.00022, "vq_loss_layer_026": 0.000328, "vq_loss_layer_027": 0.000378, "vq_loss_layer_028": 0.000614, "vq_loss_layer_029": 0.00069, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.002441 }, { "ce_loss": 2.315373, "epoch": 0.01495, "grad_norm": 0.0010270070051774383, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083496, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.068359, "key_mse_loss_layer_013": 0.106445, "key_mse_loss_layer_014": 0.104004, "key_mse_loss_layer_015": 0.093262, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050153, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050571, "step": 14950, "value_mse_loss_layer_000": 0.000422, "value_mse_loss_layer_001": 0.001198, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.000137, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000168, "vq_loss_layer_010": 0.000149, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.000282, "vq_loss_layer_013": 0.000248, "vq_loss_layer_014": 0.000301, "vq_loss_layer_015": 0.000313, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000205, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000158, "vq_loss_layer_021": 0.000248, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000214, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000228, "vq_loss_layer_026": 0.000385, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.00069, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.002716 }, { "ce_loss": 2.281207, "epoch": 0.01496, "grad_norm": 0.0012052977690473199, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.05957, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.050119, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050558, "step": 14960, "value_mse_loss_layer_000": 0.000439, "value_mse_loss_layer_001": 0.001236, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.014771, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.000317, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000275, "vq_loss_layer_022": 0.000181, "vq_loss_layer_023": 0.000215, "vq_loss_layer_024": 0.000203, "vq_loss_layer_025": 0.00024, "vq_loss_layer_026": 0.000381, "vq_loss_layer_027": 0.000433, "vq_loss_layer_028": 0.00061, "vq_loss_layer_029": 0.000702, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.002365 }, { "ce_loss": 2.243224, "epoch": 0.01497, "grad_norm": 0.0012298148358240724, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.011047, "key_mse_loss_layer_002": 0.060059, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.100098, "key_mse_loss_layer_024": 0.083496, "key_mse_loss_layer_025": 0.07959, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.094727, "key_mse_loss_layer_028": 0.099609, "key_mse_loss_layer_029": 0.095215, "key_mse_loss_layer_030": 0.09375, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.049796, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050208, "step": 14970, "value_mse_loss_layer_000": 0.000416, "value_mse_loss_layer_001": 0.001221, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.014648, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.03418, "value_mse_loss_layer_025": 0.038574, "value_mse_loss_layer_026": 0.036133, "value_mse_loss_layer_027": 0.047852, "value_mse_loss_layer_028": 0.052734, "value_mse_loss_layer_029": 0.0625, "value_mse_loss_layer_030": 0.068359, "value_mse_loss_layer_031": 0.055908, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000182, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000169, "vq_loss_layer_012": 0.000288, "vq_loss_layer_013": 0.000236, "vq_loss_layer_014": 0.000296, "vq_loss_layer_015": 0.000317, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000229, "vq_loss_layer_018": 0.000238, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000146, "vq_loss_layer_021": 0.000218, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000195, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000469, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.001022, "vq_loss_layer_029": 0.001289, "vq_loss_layer_030": 0.002548, "vq_loss_layer_031": 0.00386 }, { "ce_loss": 2.31874, "epoch": 0.01498, "grad_norm": 0.0010508785489946604, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049649, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050046, "step": 14980, "value_mse_loss_layer_000": 0.000425, "value_mse_loss_layer_001": 0.001213, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.028442, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000181, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000301, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.290231, "epoch": 0.01499, "grad_norm": 0.0013055120361968875, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.055908, "key_mse_loss_layer_006": 0.062256, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.049783, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050223, "step": 14990, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.001213, "value_mse_loss_layer_002": 0.004486, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000182, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000458, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000366, "vq_loss_layer_026": 0.00061, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000889, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.003479 }, { "ce_loss": 2.297987, "epoch": 0.015, "grad_norm": 0.001081292051821947, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050266, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050674, "step": 15000, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.049072, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000184, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000332, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.00022, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.000759, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.003036 }, { "ce_loss": 2.260455, "epoch": 0.01501, "grad_norm": 0.0012583613861352205, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.079102, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.110352, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.050491, "kv_vq_loss": 0.000407, "learning_rate": 0.001, "loss": 0.050919, "step": 15010, "value_mse_loss_layer_000": 0.000435, "value_mse_loss_layer_001": 0.001221, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.010071, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.016602, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000185, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000223, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000465, "vq_loss_layer_016": 0.000408, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000217, "vq_loss_layer_020": 0.000252, "vq_loss_layer_021": 0.000402, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.000332, "vq_loss_layer_024": 0.00036, "vq_loss_layer_025": 0.000519, "vq_loss_layer_026": 0.000587, "vq_loss_layer_027": 0.000671, "vq_loss_layer_028": 0.001068, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.00386 }, { "ce_loss": 2.327975, "epoch": 0.01502, "grad_norm": 0.001183548360131681, "key_mse_loss_layer_000": 0.002533, "key_mse_loss_layer_001": 0.009583, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.136719, "key_mse_loss_layer_014": 0.131836, "key_mse_loss_layer_015": 0.120605, "key_mse_loss_layer_016": 0.114258, "key_mse_loss_layer_017": 0.116211, "key_mse_loss_layer_018": 0.122559, "key_mse_loss_layer_019": 0.100098, "key_mse_loss_layer_020": 0.114746, "key_mse_loss_layer_021": 0.107422, "key_mse_loss_layer_022": 0.111328, "key_mse_loss_layer_023": 0.109863, "key_mse_loss_layer_024": 0.084473, "key_mse_loss_layer_025": 0.080566, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.089844, "key_mse_loss_layer_028": 0.098633, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.050479, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050906, "step": 15020, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001129, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.013611, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.020996, "value_mse_loss_layer_023": 0.023804, "value_mse_loss_layer_024": 0.026367, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.028442, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.040039, "value_mse_loss_layer_029": 0.046875, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000206, "vq_loss_layer_011": 0.000178, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000401, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000204, "vq_loss_layer_020": 0.00023, "vq_loss_layer_021": 0.000387, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000448, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.001129, "vq_loss_layer_029": 0.001968, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.360689, "epoch": 0.01503, "grad_norm": 0.0010843139607459307, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.059814, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.068359, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.104492, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.086426, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.050488, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.0509, "step": 15030, "value_mse_loss_layer_000": 0.000435, "value_mse_loss_layer_001": 0.001198, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000128, "vq_loss_layer_009": 0.00018, "vq_loss_layer_010": 0.000148, "vq_loss_layer_011": 0.000169, "vq_loss_layer_012": 0.000298, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000294, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.000219, "vq_loss_layer_024": 0.000206, "vq_loss_layer_025": 0.000226, "vq_loss_layer_026": 0.000364, "vq_loss_layer_027": 0.000429, "vq_loss_layer_028": 0.000614, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.001587, "vq_loss_layer_031": 0.002579 }, { "ce_loss": 2.318806, "epoch": 0.01504, "grad_norm": 0.0011515604564920068, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.05004, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050479, "step": 15040, "value_mse_loss_layer_000": 0.000433, "value_mse_loss_layer_001": 0.001198, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000391, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.000755, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.002686 }, { "ce_loss": 2.288823, "epoch": 0.01505, "grad_norm": 0.0011147208278998733, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.058838, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.04971, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050116, "step": 15050, "value_mse_loss_layer_000": 0.000433, "value_mse_loss_layer_001": 0.001205, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.1e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000221, "vq_loss_layer_025": 0.000257, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000425, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.001694, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.32695, "epoch": 0.01506, "grad_norm": 0.001203632215037942, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.064453, "key_mse_loss_layer_026": 0.073242, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049506, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.049918, "step": 15060, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.001198, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000141, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000168, "vq_loss_layer_021": 0.000282, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000275, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000423, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.000816, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.002686 }, { "ce_loss": 2.302028, "epoch": 0.01507, "grad_norm": 0.001321426359936595, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.109863, "key_mse_loss_layer_016": 0.102051, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.109863, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.100098, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.049811, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050238, "step": 15070, "value_mse_loss_layer_000": 0.00041, "value_mse_loss_layer_001": 0.00119, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015198, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.022217, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000252, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000269, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000218, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.000402, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.001045, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.003326 }, { "ce_loss": 2.28675, "epoch": 0.01508, "grad_norm": 0.001126859220676124, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.120117, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050412, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050827, "step": 15080, "value_mse_loss_layer_000": 0.000412, "value_mse_loss_layer_001": 0.001175, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.01416, "value_mse_loss_layer_012": 0.015625, "value_mse_loss_layer_013": 0.016968, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.02063, "value_mse_loss_layer_016": 0.015869, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000232, "vq_loss_layer_010": 0.000205, "vq_loss_layer_011": 0.000225, "vq_loss_layer_012": 0.000381, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000492, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.000414, "vq_loss_layer_018": 0.000218, "vq_loss_layer_019": 0.000217, "vq_loss_layer_020": 0.000284, "vq_loss_layer_021": 0.000383, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.000473, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.000935, "vq_loss_layer_030": 0.001694, "vq_loss_layer_031": 0.003082 }, { "ce_loss": 2.370337, "epoch": 0.01509, "grad_norm": 0.0011521624401211739, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.049609, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.05, "step": 15090, "value_mse_loss_layer_000": 0.000425, "value_mse_loss_layer_001": 0.001198, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007629, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000475, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.001709, "vq_loss_layer_031": 0.003159 }, { "ce_loss": 2.289148, "epoch": 0.0151, "grad_norm": 0.0012224615784361959, "key_mse_loss_layer_000": 0.00354, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.106445, "key_mse_loss_layer_011": 0.105469, "key_mse_loss_layer_012": 0.079102, "key_mse_loss_layer_013": 0.132812, "key_mse_loss_layer_014": 0.128906, "key_mse_loss_layer_015": 0.115234, "key_mse_loss_layer_016": 0.10791, "key_mse_loss_layer_017": 0.10791, "key_mse_loss_layer_018": 0.114258, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.106445, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.100098, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.090332, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.096191, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050357, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050806, "step": 15100, "value_mse_loss_layer_000": 0.000443, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009583, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.013306, "value_mse_loss_layer_011": 0.014038, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.025391, "value_mse_loss_layer_024": 0.027588, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.050537, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000233, "vq_loss_layer_011": 0.00024, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.00041, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000307, "vq_loss_layer_021": 0.000366, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000298, "vq_loss_layer_024": 0.000309, "vq_loss_layer_025": 0.000477, "vq_loss_layer_026": 0.000648, "vq_loss_layer_027": 0.00074, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001366, "vq_loss_layer_030": 0.003616, "vq_loss_layer_031": 0.003784 }, { "ce_loss": 2.336234, "epoch": 0.01511, "grad_norm": 0.0010723713785409927, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050064, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050494, "step": 15110, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.001213, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000166, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000193, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.00037, "vq_loss_layer_027": 0.000408, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.00082, "vq_loss_layer_030": 0.001671, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.323165, "epoch": 0.01512, "grad_norm": 0.0012809208128601313, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.05015, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050571, "step": 15120, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001205, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.013855, "value_mse_loss_layer_012": 0.015015, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000224, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000211, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000326, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000483, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.001068, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.276367, "epoch": 0.01513, "grad_norm": 0.0010557695059105754, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050235, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050677, "step": 15130, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000381, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.00079, "vq_loss_layer_029": 0.000671, "vq_loss_layer_030": 0.001564, "vq_loss_layer_031": 0.002472 }, { "ce_loss": 2.346045, "epoch": 0.01514, "grad_norm": 0.0010411932598799467, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.049915, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.050327, "step": 15140, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.001205, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.011719, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000349, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000319, "vq_loss_layer_025": 0.000389, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.001167, "vq_loss_layer_029": 0.001495, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.003876 }, { "ce_loss": 2.292054, "epoch": 0.01515, "grad_norm": 0.0011188475182279944, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050165, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050595, "step": 15150, "value_mse_loss_layer_000": 0.000437, "value_mse_loss_layer_001": 0.001213, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000284, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000347, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.296141, "epoch": 0.01516, "grad_norm": 0.0014063309645280242, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.050394, "kv_vq_loss": 0.000422, "learning_rate": 0.001, "loss": 0.050867, "step": 15160, "value_mse_loss_layer_000": 0.000433, "value_mse_loss_layer_001": 0.001205, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000137, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000294, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000228, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.001808, "vq_loss_layer_031": 0.003647 }, { "ce_loss": 2.324037, "epoch": 0.01517, "grad_norm": 0.0012217119801789522, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.053711, "key_mse_loss_layer_004": 0.059814, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.04982, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050253, "step": 15170, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001198, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000162, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000158, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.000206, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000211, "vq_loss_layer_026": 0.000359, "vq_loss_layer_027": 0.000406, "vq_loss_layer_028": 0.000587, "vq_loss_layer_029": 0.000729, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.002243 }, { "ce_loss": 2.279282, "epoch": 0.01518, "grad_norm": 0.0010935741011053324, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.043457, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050366, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050781, "step": 15180, "value_mse_loss_layer_000": 0.000422, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004456, "value_mse_loss_layer_003": 0.008789, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.007233, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.062012, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000317, "vq_loss_layer_016": 0.00028, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.00021, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000303, "vq_loss_layer_023": 0.000355, "vq_loss_layer_024": 0.000366, "vq_loss_layer_025": 0.000534, "vq_loss_layer_026": 0.000656, "vq_loss_layer_027": 0.000858, "vq_loss_layer_028": 0.001358, "vq_loss_layer_029": 0.002228, "vq_loss_layer_030": 0.003052, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.290175, "epoch": 0.01519, "grad_norm": 0.0011336588067933917, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050198, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050607, "step": 15190, "value_mse_loss_layer_000": 0.000433, "value_mse_loss_layer_001": 0.001198, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000486, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.000866, "vq_loss_layer_030": 0.001526, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.298668, "epoch": 0.0152, "grad_norm": 0.0011930253822356462, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.050259, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050677, "step": 15200, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000439, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000357, "vq_loss_layer_022": 0.00024, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.31172, "epoch": 0.01521, "grad_norm": 0.0012730822199955583, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049484, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.049915, "step": 15210, "value_mse_loss_layer_000": 0.000414, "value_mse_loss_layer_001": 0.001175, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.00018, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000301, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.00016, "vq_loss_layer_021": 0.000269, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.00025, "vq_loss_layer_026": 0.000387, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.001686, "vq_loss_layer_031": 0.002884 }, { "ce_loss": 2.311638, "epoch": 0.01522, "grad_norm": 0.001152448239736259, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.042236, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.060547, "kv_mse_loss": 0.049963, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050394, "step": 15220, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.001205, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000141, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.000473, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000904, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.003265 }, { "ce_loss": 2.300563, "epoch": 0.01523, "grad_norm": 0.0012009210186079144, "key_mse_loss_layer_000": 0.003708, "key_mse_loss_layer_001": 0.010925, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049951, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050394, "step": 15230, "value_mse_loss_layer_000": 0.000443, "value_mse_loss_layer_001": 0.001228, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.00766, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.050537, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000292, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.001122, "vq_loss_layer_030": 0.002548, "vq_loss_layer_031": 0.003265 }, { "ce_loss": 2.262264, "epoch": 0.01524, "grad_norm": 0.0012535040732473135, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.07959, "kv_mse_loss": 0.050055, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050482, "step": 15240, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.00119, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000238, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.001373, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.003403 }, { "ce_loss": 2.309463, "epoch": 0.01525, "grad_norm": 0.0009660770301707089, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.060791, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050186, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.05062, "step": 15250, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004517, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000189, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000973, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.260226, "epoch": 0.01526, "grad_norm": 0.0012704564724117517, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050217, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050656, "step": 15260, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.00119, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.025269, "value_mse_loss_layer_023": 0.028442, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.033936, "value_mse_loss_layer_027": 0.043701, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.00079, "vq_loss_layer_029": 0.000916, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.003159 }, { "ce_loss": 2.301933, "epoch": 0.01527, "grad_norm": 0.0012186202220618725, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050104, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050534, "step": 15270, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000236, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000429, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000208, "vq_loss_layer_020": 0.000239, "vq_loss_layer_021": 0.000412, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.000465, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.001884, "vq_loss_layer_031": 0.002716 }, { "ce_loss": 2.311995, "epoch": 0.01528, "grad_norm": 0.001081627095118165, "key_mse_loss_layer_000": 0.003906, "key_mse_loss_layer_001": 0.011353, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.049994, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050409, "step": 15280, "value_mse_loss_layer_000": 0.000431, "value_mse_loss_layer_001": 0.00119, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.061768, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.000218, "vq_loss_layer_024": 0.000322, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002792, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.305372, "epoch": 0.01529, "grad_norm": 0.0011329245753586292, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.060059, "kv_mse_loss": 0.050238, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050653, "step": 15290, "value_mse_loss_layer_000": 0.000404, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007446, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000215, "vq_loss_layer_011": 0.000222, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000454, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000362, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.00023, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000277, "vq_loss_layer_025": 0.000397, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000622, "vq_loss_layer_028": 0.001068, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.004517 }, { "ce_loss": 2.283957, "epoch": 0.0153, "grad_norm": 0.001423701411113143, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050186, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050626, "step": 15300, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.00119, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000164, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000444, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000828, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.002914 }, { "ce_loss": 2.290746, "epoch": 0.01531, "grad_norm": 0.0011215652339160442, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050159, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050595, "step": 15310, "value_mse_loss_layer_000": 0.000425, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000395, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000217, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000347, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000309, "vq_loss_layer_025": 0.000408, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.001221, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.330407, "epoch": 0.01532, "grad_norm": 0.0010517312912270427, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.104492, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.111328, "key_mse_loss_layer_016": 0.103516, "key_mse_loss_layer_017": 0.10498, "key_mse_loss_layer_018": 0.110352, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049847, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050269, "step": 15320, "value_mse_loss_layer_000": 0.000412, "value_mse_loss_layer_001": 0.001175, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.024536, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.042236, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000189, "vq_loss_layer_009": 0.00023, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000393, "vq_loss_layer_015": 0.000425, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000211, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000271, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000355, "vq_loss_layer_025": 0.000385, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.003281 }, { "ce_loss": 2.290066, "epoch": 0.01533, "grad_norm": 0.0011714244028553367, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.094238, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049838, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050241, "step": 15330, "value_mse_loss_layer_000": 0.000431, "value_mse_loss_layer_001": 0.001198, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.1e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000144, "vq_loss_layer_009": 0.000191, "vq_loss_layer_010": 0.000155, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000165, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000209, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000221, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000469, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.317724, "epoch": 0.01534, "grad_norm": 0.0009907333878800273, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049677, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050098, "step": 15340, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000173, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.00025, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000435, "vq_loss_layer_028": 0.000595, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.002548 }, { "ce_loss": 2.297296, "epoch": 0.01535, "grad_norm": 0.0014507020823657513, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049854, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050278, "step": 15350, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000157, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.000166, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000224, "vq_loss_layer_024": 0.000226, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000391, "vq_loss_layer_027": 0.000452, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.002914 }, { "ce_loss": 2.294365, "epoch": 0.01536, "grad_norm": 0.0011170553043484688, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.10791, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.050378, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050812, "step": 15360, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000326, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.000813, "vq_loss_layer_030": 0.001938, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.312634, "epoch": 0.01537, "grad_norm": 0.001172623597085476, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050055, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.05047, "step": 15370, "value_mse_loss_layer_000": 0.000416, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000213, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000241, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000896, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.283117, "epoch": 0.01538, "grad_norm": 0.0010640873806551099, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.059082, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.05004, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050461, "step": 15380, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.00116, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000134, "vq_loss_layer_020": 0.000164, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000205, "vq_loss_layer_023": 0.000239, "vq_loss_layer_024": 0.0002, "vq_loss_layer_025": 0.000236, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.000408, "vq_loss_layer_028": 0.000607, "vq_loss_layer_029": 0.000748, "vq_loss_layer_030": 0.001701, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.246924, "epoch": 0.01539, "grad_norm": 0.001256916904821992, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.104004, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.123535, "key_mse_loss_layer_014": 0.120117, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.103516, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.050125, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050546, "step": 15390, "value_mse_loss_layer_000": 0.000406, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.009033, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.013184, "value_mse_loss_layer_011": 0.014343, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017456, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.025024, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.049072, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7.1e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000211, "vq_loss_layer_011": 0.000235, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.00042, "vq_loss_layer_015": 0.000446, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.00021, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000257, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000338, "vq_loss_layer_025": 0.000383, "vq_loss_layer_026": 0.000534, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000904, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.287185, "epoch": 0.0154, "grad_norm": 0.001052618958055973, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.049683, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050107, "step": 15400, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.00119, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000345, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000236, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000385, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.000866, "vq_loss_layer_030": 0.001541, "vq_loss_layer_031": 0.002914 }, { "ce_loss": 2.288668, "epoch": 0.01541, "grad_norm": 0.001223606988787651, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.049783, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050235, "step": 15410, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.2e-05, "vq_loss_layer_007": 0.000132, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000218, "vq_loss_layer_019": 0.000141, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000242, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.000965, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.325499, "epoch": 0.01542, "grad_norm": 0.0011062659323215485, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049734, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050156, "step": 15420, "value_mse_loss_layer_000": 0.000414, "value_mse_loss_layer_001": 0.00116, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000387, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.000801, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.003082 }, { "ce_loss": 2.351896, "epoch": 0.01543, "grad_norm": 0.0011439595837146044, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049771, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050183, "step": 15430, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.014771, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000176, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000236, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000257, "vq_loss_layer_018": 0.000164, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000282, "vq_loss_layer_022": 0.000199, "vq_loss_layer_023": 0.000215, "vq_loss_layer_024": 0.000199, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000353, "vq_loss_layer_027": 0.000458, "vq_loss_layer_028": 0.000679, "vq_loss_layer_029": 0.00082, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.353649, "epoch": 0.01544, "grad_norm": 0.001335945911705494, "key_mse_loss_layer_000": 0.003723, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.123535, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049713, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050128, "step": 15440, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013794, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.00022, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000397, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.00023, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000238, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.000406, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.29101, "epoch": 0.01545, "grad_norm": 0.0010335822589695454, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.124512, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.103027, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.104004, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.100586, "key_mse_loss_layer_023": 0.097168, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.086426, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.093262, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.049872, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.050317, "step": 15450, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000319, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000215, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.00022, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.295948, "epoch": 0.01546, "grad_norm": 0.001011806190945208, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049875, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050269, "step": 15460, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000173, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000219, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000381, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000721, "vq_loss_layer_030": 0.001755, "vq_loss_layer_031": 0.002579 }, { "ce_loss": 2.292788, "epoch": 0.01547, "grad_norm": 0.0011950638145208359, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.055908, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.049811, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050244, "step": 15470, "value_mse_loss_layer_000": 0.000412, "value_mse_loss_layer_001": 0.00116, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000347, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000271, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.000862, "vq_loss_layer_029": 0.000961, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.284792, "epoch": 0.01548, "grad_norm": 0.0011114423396065831, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.050275, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050702, "step": 15480, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001175, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.1e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000181, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000189, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000261, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000204, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000326, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000843, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.317824, "epoch": 0.01549, "grad_norm": 0.0010283144656568766, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.106445, "key_mse_loss_layer_014": 0.103516, "key_mse_loss_layer_015": 0.092773, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049911, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050327, "step": 15490, "value_mse_loss_layer_000": 0.000404, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.050537, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000128, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000183, "vq_loss_layer_010": 0.000151, "vq_loss_layer_011": 0.00017, "vq_loss_layer_012": 0.000275, "vq_loss_layer_013": 0.00024, "vq_loss_layer_014": 0.000298, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000269, "vq_loss_layer_018": 0.000233, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000158, "vq_loss_layer_021": 0.000259, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000212, "vq_loss_layer_024": 0.000208, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.000977, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.003357 }, { "ce_loss": 2.30725, "epoch": 0.0155, "grad_norm": 0.0010635824874043465, "key_mse_loss_layer_000": 0.003571, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049783, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050195, "step": 15500, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000189, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000389, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000946, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.003525 }, { "ce_loss": 2.270662, "epoch": 0.01551, "grad_norm": 0.0012772688642144203, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.045166, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.060303, "kv_mse_loss": 0.049948, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050406, "step": 15510, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.00116, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000208, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000458, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000366, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.003632 }, { "ce_loss": 2.309728, "epoch": 0.01552, "grad_norm": 0.001043648924678564, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.050089, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050504, "step": 15520, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.001175, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000134, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.00017, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.000282, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000187, "vq_loss_layer_025": 0.000228, "vq_loss_layer_026": 0.000372, "vq_loss_layer_027": 0.000437, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000759, "vq_loss_layer_030": 0.001541, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.322606, "epoch": 0.01553, "grad_norm": 0.0011326425010338426, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049658, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.050067, "step": 15530, "value_mse_loss_layer_000": 0.000414, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.007446, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000164, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.00036, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000265, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.000242, "vq_loss_layer_025": 0.000294, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000433, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.000816, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.002838 }, { "ce_loss": 2.380025, "epoch": 0.01554, "grad_norm": 0.0012096636928617954, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049768, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050177, "step": 15540, "value_mse_loss_layer_000": 0.000416, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015442, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000175, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000353, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000248, "vq_loss_layer_021": 0.000362, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000393, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.341574, "epoch": 0.01555, "grad_norm": 0.0010859358590096235, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.049664, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050098, "step": 15550, "value_mse_loss_layer_000": 0.000431, "value_mse_loss_layer_001": 0.00119, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.00037, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000357, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000969, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.003387 }, { "ce_loss": 2.263078, "epoch": 0.01556, "grad_norm": 0.001053964952006936, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050134, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050568, "step": 15560, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000166, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000203, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000439, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.001762, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.35289, "epoch": 0.01557, "grad_norm": 0.0010768193751573563, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049445, "kv_vq_loss": 0.000382, "learning_rate": 0.001, "loss": 0.049854, "step": 15570, "value_mse_loss_layer_000": 0.000416, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000224, "vq_loss_layer_024": 0.000242, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.323373, "epoch": 0.01558, "grad_norm": 0.0011227609356865287, "key_mse_loss_layer_000": 0.002792, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.05957, "kv_mse_loss": 0.049561, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.049969, "step": 15580, "value_mse_loss_layer_000": 0.000401, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.025146, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.042236, "value_mse_loss_layer_029": 0.049072, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000359, "vq_loss_layer_015": 0.000435, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000269, "vq_loss_layer_021": 0.000383, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000378, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.001228, "vq_loss_layer_030": 0.001869, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.311567, "epoch": 0.01559, "grad_norm": 0.0010006084339693189, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.049796, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050226, "step": 15590, "value_mse_loss_layer_000": 0.000431, "value_mse_loss_layer_001": 0.00119, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000181, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000267, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000444, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.343716, "epoch": 0.0156, "grad_norm": 0.0010551969753578305, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.04982, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050229, "step": 15600, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.00116, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000213, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.001938, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.307923, "epoch": 0.01561, "grad_norm": 0.0011520881671458483, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.074707, "key_mse_loss_layer_031": 0.05835, "kv_mse_loss": 0.050177, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050601, "step": 15610, "value_mse_loss_layer_000": 0.000401, "value_mse_loss_layer_001": 0.00116, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000139, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000305, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000319, "vq_loss_layer_025": 0.000355, "vq_loss_layer_026": 0.000477, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000927, "vq_loss_layer_029": 0.00106, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.003799 }, { "ce_loss": 2.298001, "epoch": 0.01562, "grad_norm": 0.0012694833567366004, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050015, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.05043, "step": 15620, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.00119, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000142, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000178, "vq_loss_layer_012": 0.000298, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000313, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000267, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.00016, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000206, "vq_loss_layer_024": 0.000205, "vq_loss_layer_025": 0.000232, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.003082 }, { "ce_loss": 2.290324, "epoch": 0.01563, "grad_norm": 0.001198986661620438, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049704, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050104, "step": 15630, "value_mse_loss_layer_000": 0.00041, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.025391, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000207, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000412, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000736, "vq_loss_layer_030": 0.001663, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.290633, "epoch": 0.01564, "grad_norm": 0.0011569163762032986, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049857, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050275, "step": 15640, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001175, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.025146, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.032227, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000142, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000155, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000163, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000218, "vq_loss_layer_024": 0.000239, "vq_loss_layer_025": 0.000233, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000591, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.001869, "vq_loss_layer_031": 0.002411 }, { "ce_loss": 2.336919, "epoch": 0.01565, "grad_norm": 0.0011249303352087736, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049832, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.05025, "step": 15650, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.00119, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000656, "vq_loss_layer_028": 0.000782, "vq_loss_layer_029": 0.000965, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.35112, "epoch": 0.01566, "grad_norm": 0.0009827995672821999, "key_mse_loss_layer_000": 0.003479, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.049573, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.049954, "step": 15660, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.00119, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000185, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000294, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000215, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000219, "vq_loss_layer_023": 0.000236, "vq_loss_layer_024": 0.000236, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000477, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.002762 }, { "ce_loss": 2.36231, "epoch": 0.01567, "grad_norm": 0.0011929749744012952, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.12793, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.114258, "key_mse_loss_layer_016": 0.106445, "key_mse_loss_layer_017": 0.106934, "key_mse_loss_layer_018": 0.11377, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.104004, "key_mse_loss_layer_023": 0.100586, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.094238, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.095215, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.050055, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050494, "step": 15670, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001175, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.001732, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.304897, "epoch": 0.01568, "grad_norm": 0.0010665088193491101, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049777, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.050195, "step": 15680, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000175, "vq_loss_layer_010": 0.000151, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.00032, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.000265, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000154, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.000188, "vq_loss_layer_023": 0.00022, "vq_loss_layer_024": 0.000213, "vq_loss_layer_025": 0.00023, "vq_loss_layer_026": 0.000374, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000587, "vq_loss_layer_029": 0.00074, "vq_loss_layer_030": 0.001572, "vq_loss_layer_031": 0.002563 }, { "ce_loss": 2.281079, "epoch": 0.01569, "grad_norm": 0.0012452383525669575, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.050046, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050452, "step": 15690, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.001175, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000195, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.000343, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000664, "vq_loss_layer_028": 0.000782, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002289, "vq_loss_layer_031": 0.003464 }, { "ce_loss": 2.348388, "epoch": 0.0157, "grad_norm": 0.0011644653277471662, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.11377, "key_mse_loss_layer_016": 0.107422, "key_mse_loss_layer_017": 0.105957, "key_mse_loss_layer_018": 0.11377, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.099609, "key_mse_loss_layer_022": 0.104492, "key_mse_loss_layer_023": 0.102051, "key_mse_loss_layer_024": 0.082031, "key_mse_loss_layer_025": 0.078125, "key_mse_loss_layer_026": 0.092773, "key_mse_loss_layer_027": 0.092285, "key_mse_loss_layer_028": 0.098633, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049875, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050266, "step": 15700, "value_mse_loss_layer_000": 0.00041, "value_mse_loss_layer_001": 0.001144, "value_mse_loss_layer_002": 0.004395, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.016846, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.024536, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.049805, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.051025, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000191, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000209, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000217, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.00022, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000298, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000332, "vq_loss_layer_025": 0.000475, "vq_loss_layer_026": 0.00058, "vq_loss_layer_027": 0.000599, "vq_loss_layer_028": 0.001091, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.002625, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.311123, "epoch": 0.01571, "grad_norm": 0.0011254013516008854, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.050122, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.050519, "step": 15710, "value_mse_loss_layer_000": 0.000422, "value_mse_loss_layer_001": 0.00116, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.01416, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000181, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.000213, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000858, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.0019, "vq_loss_layer_031": 0.002563 }, { "ce_loss": 2.271284, "epoch": 0.01572, "grad_norm": 0.0010661876294761896, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.079102, "key_mse_loss_layer_013": 0.141602, "key_mse_loss_layer_014": 0.136719, "key_mse_loss_layer_015": 0.120605, "key_mse_loss_layer_016": 0.116211, "key_mse_loss_layer_017": 0.116699, "key_mse_loss_layer_018": 0.122559, "key_mse_loss_layer_019": 0.095215, "key_mse_loss_layer_020": 0.110352, "key_mse_loss_layer_021": 0.105469, "key_mse_loss_layer_022": 0.109863, "key_mse_loss_layer_023": 0.105469, "key_mse_loss_layer_024": 0.08252, "key_mse_loss_layer_025": 0.079102, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.098145, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.049835, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050256, "step": 15720, "value_mse_loss_layer_000": 0.00041, "value_mse_loss_layer_001": 0.001144, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009644, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.021729, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.023438, "value_mse_loss_layer_024": 0.026001, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.028931, "value_mse_loss_layer_027": 0.036865, "value_mse_loss_layer_028": 0.040771, "value_mse_loss_layer_029": 0.047607, "value_mse_loss_layer_030": 0.052734, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000224, "vq_loss_layer_010": 0.000225, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000334, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000201, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.000383, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.000317, "vq_loss_layer_025": 0.000584, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.000977, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.00383 }, { "ce_loss": 2.360647, "epoch": 0.01573, "grad_norm": 0.001126041985116899, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049701, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050076, "step": 15730, "value_mse_loss_layer_000": 0.000414, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000338, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000162, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000165, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000486, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000721, "vq_loss_layer_030": 0.001625, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.296633, "epoch": 0.01574, "grad_norm": 0.0011157408589497209, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.061035, "kv_mse_loss": 0.0505, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050879, "step": 15740, "value_mse_loss_layer_000": 0.00041, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.01416, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.024414, "value_mse_loss_layer_024": 0.027344, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.028687, "value_mse_loss_layer_027": 0.036621, "value_mse_loss_layer_028": 0.041992, "value_mse_loss_layer_029": 0.047363, "value_mse_loss_layer_030": 0.053467, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000334, "vq_loss_layer_018": 0.000223, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.00038, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.000839, "vq_loss_layer_029": 0.000816, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.359858, "epoch": 0.01575, "grad_norm": 0.0012516911374405026, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049866, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.05029, "step": 15750, "value_mse_loss_layer_000": 0.000416, "value_mse_loss_layer_001": 0.00116, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.049805, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000349, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000181, "vq_loss_layer_020": 0.000214, "vq_loss_layer_021": 0.000383, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000626, "vq_loss_layer_029": 0.000816, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.002838 }, { "ce_loss": 2.31168, "epoch": 0.01576, "grad_norm": 0.0011406107805669308, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.091309, "key_mse_loss_layer_009": 0.098145, "key_mse_loss_layer_010": 0.110352, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.078613, "key_mse_loss_layer_013": 0.135742, "key_mse_loss_layer_014": 0.132812, "key_mse_loss_layer_015": 0.122559, "key_mse_loss_layer_016": 0.115723, "key_mse_loss_layer_017": 0.116211, "key_mse_loss_layer_018": 0.12207, "key_mse_loss_layer_019": 0.097168, "key_mse_loss_layer_020": 0.112793, "key_mse_loss_layer_021": 0.105957, "key_mse_loss_layer_022": 0.11084, "key_mse_loss_layer_023": 0.105469, "key_mse_loss_layer_024": 0.083984, "key_mse_loss_layer_025": 0.079102, "key_mse_loss_layer_026": 0.094238, "key_mse_loss_layer_027": 0.089844, "key_mse_loss_layer_028": 0.098633, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.096191, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049945, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050372, "step": 15760, "value_mse_loss_layer_000": 0.000414, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.013367, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.014648, "value_mse_loss_layer_019": 0.017212, "value_mse_loss_layer_020": 0.019043, "value_mse_loss_layer_021": 0.020996, "value_mse_loss_layer_022": 0.02124, "value_mse_loss_layer_023": 0.022217, "value_mse_loss_layer_024": 0.025635, "value_mse_loss_layer_025": 0.03125, "value_mse_loss_layer_026": 0.026489, "value_mse_loss_layer_027": 0.035645, "value_mse_loss_layer_028": 0.038818, "value_mse_loss_layer_029": 0.043945, "value_mse_loss_layer_030": 0.049805, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.8e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000195, "vq_loss_layer_009": 0.000242, "vq_loss_layer_010": 0.000214, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000257, "vq_loss_layer_021": 0.000391, "vq_loss_layer_022": 0.00036, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.000427, "vq_loss_layer_025": 0.000576, "vq_loss_layer_026": 0.000568, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.00386 }, { "ce_loss": 2.337453, "epoch": 0.01577, "grad_norm": 0.0010361120803281665, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049826, "kv_vq_loss": 0.000382, "learning_rate": 0.001, "loss": 0.050201, "step": 15770, "value_mse_loss_layer_000": 0.000422, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.000904, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.002991 }, { "ce_loss": 2.29193, "epoch": 0.01578, "grad_norm": 0.0011823374079540372, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050159, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.050586, "step": 15780, "value_mse_loss_layer_000": 0.000414, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000309, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000163, "vq_loss_layer_021": 0.000265, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000224, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.003036 }, { "ce_loss": 2.304058, "epoch": 0.01579, "grad_norm": 0.00108536914922297, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049969, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050391, "step": 15790, "value_mse_loss_layer_000": 0.000422, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000157, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000141, "vq_loss_layer_020": 0.000166, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000231, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.002716 }, { "ce_loss": 2.316164, "epoch": 0.0158, "grad_norm": 0.0011738961329683661, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049747, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050143, "step": 15800, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001144, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000182, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000801, "vq_loss_layer_030": 0.001778, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.298218, "epoch": 0.01581, "grad_norm": 0.001232215785421431, "key_mse_loss_layer_000": 0.003662, "key_mse_loss_layer_001": 0.010925, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.050223, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050665, "step": 15810, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013245, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.01416, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000178, "vq_loss_layer_012": 0.00029, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000157, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.000204, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.002686, "vq_loss_layer_031": 0.003128 }, { "ce_loss": 2.263656, "epoch": 0.01582, "grad_norm": 0.001059150556102395, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050537, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050952, "step": 15820, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000183, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000162, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000202, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.003067 }, { "ce_loss": 2.272506, "epoch": 0.01583, "grad_norm": 0.0011119537521153688, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.050168, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050583, "step": 15830, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000446, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000242, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000242, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.001846, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.358731, "epoch": 0.01584, "grad_norm": 0.0010691573843359947, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049814, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050223, "step": 15840, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001167, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000228, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.000866, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.002991 }, { "ce_loss": 2.339599, "epoch": 0.01585, "grad_norm": 0.0010401617037132382, "key_mse_loss_layer_000": 0.003571, "key_mse_loss_layer_001": 0.011047, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.109863, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050089, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050507, "step": 15850, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001198, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.008911, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000117, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000179, "vq_loss_layer_009": 0.000233, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.000234, "vq_loss_layer_019": 0.000218, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.000397, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.000942, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.003265 }, { "ce_loss": 2.301237, "epoch": 0.01586, "grad_norm": 0.001331845298409462, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050034, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.050482, "step": 15860, "value_mse_loss_layer_000": 0.000414, "value_mse_loss_layer_001": 0.001144, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025024, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.003357 }, { "ce_loss": 2.262366, "epoch": 0.01587, "grad_norm": 0.001021023141220212, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.07959, "key_mse_loss_layer_009": 0.083008, "key_mse_loss_layer_010": 0.094238, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.107422, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.09375, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.094727, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.074219, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.049982, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.050394, "step": 15870, "value_mse_loss_layer_000": 0.00041, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000261, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000215, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000275, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000475, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.001732, "vq_loss_layer_031": 0.002914 }, { "ce_loss": 2.341123, "epoch": 0.01588, "grad_norm": 0.001053741667419672, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.05249, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.062256, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.075684, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.060303, "kv_mse_loss": 0.04989, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.050287, "step": 15880, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.02478, "value_mse_loss_layer_024": 0.027344, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.028442, "value_mse_loss_layer_027": 0.036377, "value_mse_loss_layer_028": 0.041992, "value_mse_loss_layer_029": 0.048584, "value_mse_loss_layer_030": 0.052002, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000134, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000159, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000347, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.000717, "vq_loss_layer_030": 0.001701, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.315467, "epoch": 0.01589, "grad_norm": 0.0013981516240164638, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.061523, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.050201, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050623, "step": 15890, "value_mse_loss_layer_000": 0.000412, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006195, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.000128, "vq_loss_layer_009": 0.000182, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000286, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000267, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000221, "vq_loss_layer_024": 0.000212, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.002518, "vq_loss_layer_031": 0.002533 }, { "ce_loss": 2.309575, "epoch": 0.0159, "grad_norm": 0.0010367134818807244, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049966, "kv_vq_loss": 0.000412, "learning_rate": 0.001, "loss": 0.050412, "step": 15900, "value_mse_loss_layer_000": 0.000416, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000481, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000215, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.000347, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.001755, "vq_loss_layer_031": 0.002716 }, { "ce_loss": 2.247372, "epoch": 0.01591, "grad_norm": 0.0010479066986590624, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.104492, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.128906, "key_mse_loss_layer_014": 0.125977, "key_mse_loss_layer_015": 0.115234, "key_mse_loss_layer_016": 0.109863, "key_mse_loss_layer_017": 0.110352, "key_mse_loss_layer_018": 0.116211, "key_mse_loss_layer_019": 0.095215, "key_mse_loss_layer_020": 0.108887, "key_mse_loss_layer_021": 0.103027, "key_mse_loss_layer_022": 0.106934, "key_mse_loss_layer_023": 0.103516, "key_mse_loss_layer_024": 0.082031, "key_mse_loss_layer_025": 0.078125, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.087891, "key_mse_loss_layer_028": 0.09668, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.09668, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049875, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.050272, "step": 15910, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.025024, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.029297, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.042236, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000744, "vq_loss_layer_030": 0.0019, "vq_loss_layer_031": 0.002747 }, { "ce_loss": 2.335926, "epoch": 0.01592, "grad_norm": 0.0011565667809918523, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.093262, "key_mse_loss_layer_031": 0.078613, "kv_mse_loss": 0.049902, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050339, "step": 15920, "value_mse_loss_layer_000": 0.000406, "value_mse_loss_layer_001": 0.00116, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.056641, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000182, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000246, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000282, "vq_loss_layer_017": 0.000265, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000158, "vq_loss_layer_021": 0.000256, "vq_loss_layer_022": 0.000202, "vq_loss_layer_023": 0.000208, "vq_loss_layer_024": 0.000207, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000387, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.286248, "epoch": 0.01593, "grad_norm": 0.0012102832552045584, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.060303, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.123535, "key_mse_loss_layer_014": 0.121582, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.05011, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050543, "step": 15930, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.016235, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.025024, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.029419, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.049072, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000178, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.00022, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.000431, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000391, "vq_loss_layer_017": 0.000353, "vq_loss_layer_018": 0.000233, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000269, "vq_loss_layer_021": 0.000387, "vq_loss_layer_022": 0.000364, "vq_loss_layer_023": 0.000353, "vq_loss_layer_024": 0.000397, "vq_loss_layer_025": 0.000576, "vq_loss_layer_026": 0.000648, "vq_loss_layer_027": 0.000793, "vq_loss_layer_028": 0.001007, "vq_loss_layer_029": 0.001038, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.004089 }, { "ce_loss": 2.355877, "epoch": 0.01594, "grad_norm": 0.0012266739504411817, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.049908, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050342, "step": 15940, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.00116, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000404, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000233, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000231, "vq_loss_layer_024": 0.00028, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.002441, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.283494, "epoch": 0.01595, "grad_norm": 0.0013660159893333912, "key_mse_loss_layer_000": 0.002823, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.081543, "key_mse_loss_layer_008": 0.093262, "key_mse_loss_layer_009": 0.100098, "key_mse_loss_layer_010": 0.115234, "key_mse_loss_layer_011": 0.112305, "key_mse_loss_layer_012": 0.083496, "key_mse_loss_layer_013": 0.146484, "key_mse_loss_layer_014": 0.141602, "key_mse_loss_layer_015": 0.125977, "key_mse_loss_layer_016": 0.123047, "key_mse_loss_layer_017": 0.121582, "key_mse_loss_layer_018": 0.129883, "key_mse_loss_layer_019": 0.103027, "key_mse_loss_layer_020": 0.116699, "key_mse_loss_layer_021": 0.11084, "key_mse_loss_layer_022": 0.114746, "key_mse_loss_layer_023": 0.11377, "key_mse_loss_layer_024": 0.089844, "key_mse_loss_layer_025": 0.083984, "key_mse_loss_layer_026": 0.101562, "key_mse_loss_layer_027": 0.096191, "key_mse_loss_layer_028": 0.106934, "key_mse_loss_layer_029": 0.094727, "key_mse_loss_layer_030": 0.10791, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.050201, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050616, "step": 15950, "value_mse_loss_layer_000": 0.000368, "value_mse_loss_layer_001": 0.001114, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.00708, "value_mse_loss_layer_006": 0.009094, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.014038, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.02417, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000202, "vq_loss_layer_009": 0.000246, "vq_loss_layer_010": 0.000219, "vq_loss_layer_011": 0.000233, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000404, "vq_loss_layer_014": 0.000427, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000226, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.000429, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000614, "vq_loss_layer_028": 0.000961, "vq_loss_layer_029": 0.001076, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003662 }, { "ce_loss": 2.284165, "epoch": 0.01596, "grad_norm": 0.0013151487801223993, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050266, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050693, "step": 15960, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.00116, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.00029, "vq_loss_layer_024": 0.000232, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000477, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.002747, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.287586, "epoch": 0.01597, "grad_norm": 0.001042294199578464, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.049765, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050165, "step": 15970, "value_mse_loss_layer_000": 0.000425, "value_mse_loss_layer_001": 0.001175, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000166, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000224, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000239, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000399, "vq_loss_layer_027": 0.000465, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000767, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.285386, "epoch": 0.01598, "grad_norm": 0.0013030656846240163, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.044922, "key_mse_loss_layer_004": 0.04126, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.0625, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.074707, "key_mse_loss_layer_030": 0.076172, "key_mse_loss_layer_031": 0.057861, "kv_mse_loss": 0.049927, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.050314, "step": 15980, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001129, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.022095, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.049805, "value_mse_loss_layer_030": 0.053467, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000141, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000217, "vq_loss_layer_021": 0.000372, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000319, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000343, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.00095, "vq_loss_layer_029": 0.00074, "vq_loss_layer_030": 0.001587, "vq_loss_layer_031": 0.003464 }, { "ce_loss": 2.306703, "epoch": 0.01599, "grad_norm": 0.0010569997830316424, "key_mse_loss_layer_000": 0.003738, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.050226, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050638, "step": 15990, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.00116, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000387, "vq_loss_layer_027": 0.000408, "vq_loss_layer_028": 0.000626, "vq_loss_layer_029": 0.000626, "vq_loss_layer_030": 0.00164, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.320996, "epoch": 0.016, "grad_norm": 0.0014169777277857065, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050443, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.05087, "step": 16000, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000142, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000201, "vq_loss_layer_023": 0.000223, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.00042, "vq_loss_layer_028": 0.000645, "vq_loss_layer_029": 0.000816, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.002838 }, { "ce_loss": 2.297531, "epoch": 0.01601, "grad_norm": 0.0010509133571758866, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050061, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050455, "step": 16010, "value_mse_loss_layer_000": 0.000412, "value_mse_loss_layer_001": 0.001129, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006287, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000144, "vq_loss_layer_009": 0.000189, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000229, "vq_loss_layer_024": 0.000193, "vq_loss_layer_025": 0.00024, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000412, "vq_loss_layer_028": 0.000584, "vq_loss_layer_029": 0.000656, "vq_loss_layer_030": 0.001564, "vq_loss_layer_031": 0.002304 }, { "ce_loss": 2.299838, "epoch": 0.01602, "grad_norm": 0.0010471794521436095, "key_mse_loss_layer_000": 0.004913, "key_mse_loss_layer_001": 0.011719, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.049863, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050278, "step": 16020, "value_mse_loss_layer_000": 0.00046, "value_mse_loss_layer_001": 0.001198, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 6e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.00012, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000172, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000315, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000187, "vq_loss_layer_023": 0.000231, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000759, "vq_loss_layer_030": 0.00161, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.316684, "epoch": 0.01603, "grad_norm": 0.0012890922371298075, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049744, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050186, "step": 16030, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001122, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.000191, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000372, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.000843, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.002747 }, { "ce_loss": 2.320005, "epoch": 0.01604, "grad_norm": 0.0011155925458297133, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.049805, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050226, "step": 16040, "value_mse_loss_layer_000": 0.000425, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000211, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000469, "vq_loss_layer_028": 0.000633, "vq_loss_layer_029": 0.000843, "vq_loss_layer_030": 0.001694, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.315694, "epoch": 0.01605, "grad_norm": 0.0010462907375767827, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.109863, "key_mse_loss_layer_016": 0.102539, "key_mse_loss_layer_017": 0.103027, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.049738, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050128, "step": 16050, "value_mse_loss_layer_000": 0.000404, "value_mse_loss_layer_001": 0.001129, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.0177, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.025391, "value_mse_loss_layer_024": 0.0271, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000207, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000437, "vq_loss_layer_022": 0.000273, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.00038, "vq_loss_layer_026": 0.000456, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000942, "vq_loss_layer_029": 0.000904, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.003571 }, { "ce_loss": 2.339801, "epoch": 0.01606, "grad_norm": 0.0012090427335351706, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050092, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.0505, "step": 16060, "value_mse_loss_layer_000": 0.000416, "value_mse_loss_layer_001": 0.001144, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.028931, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.04126, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054199, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000185, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000241, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.311804, "epoch": 0.01607, "grad_norm": 0.0010656636441126466, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.011108, "key_mse_loss_layer_002": 0.061035, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.071777, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.100098, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.101562, "key_mse_loss_layer_023": 0.100098, "key_mse_loss_layer_024": 0.081543, "key_mse_loss_layer_025": 0.078613, "key_mse_loss_layer_026": 0.090332, "key_mse_loss_layer_027": 0.09375, "key_mse_loss_layer_028": 0.098145, "key_mse_loss_layer_029": 0.094238, "key_mse_loss_layer_030": 0.098145, "key_mse_loss_layer_031": 0.078613, "kv_mse_loss": 0.049811, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.050208, "step": 16070, "value_mse_loss_layer_000": 0.000406, "value_mse_loss_layer_001": 0.00116, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014404, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013184, "value_mse_loss_layer_013": 0.014282, "value_mse_loss_layer_014": 0.015198, "value_mse_loss_layer_015": 0.016846, "value_mse_loss_layer_016": 0.013794, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.043945, "value_mse_loss_layer_028": 0.048096, "value_mse_loss_layer_029": 0.057373, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.05127, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000172, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000237, "vq_loss_layer_014": 0.000311, "vq_loss_layer_015": 0.000311, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000257, "vq_loss_layer_018": 0.000214, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000169, "vq_loss_layer_021": 0.000271, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.00022, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.000334, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.003204 }, { "ce_loss": 2.352107, "epoch": 0.01608, "grad_norm": 0.0011541658313944936, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049615, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050037, "step": 16080, "value_mse_loss_layer_000": 0.00041, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000232, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.003159 }, { "ce_loss": 2.32113, "epoch": 0.01609, "grad_norm": 0.0010716503020375967, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.049826, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050241, "step": 16090, "value_mse_loss_layer_000": 0.000406, "value_mse_loss_layer_001": 0.001129, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000219, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.001633, "vq_loss_layer_031": 0.002686 }, { "ce_loss": 2.323858, "epoch": 0.0161, "grad_norm": 0.0011763995280489326, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.049915, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.050327, "step": 16100, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000353, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.00267 }, { "ce_loss": 2.350648, "epoch": 0.01611, "grad_norm": 0.000991593231447041, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049683, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050073, "step": 16110, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000137, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000164, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000213, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000221, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000441, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.000847, "vq_loss_layer_030": 0.001694, "vq_loss_layer_031": 0.002609 }, { "ce_loss": 2.347792, "epoch": 0.01612, "grad_norm": 0.0012317884247750044, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.049689, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050082, "step": 16120, "value_mse_loss_layer_000": 0.000401, "value_mse_loss_layer_001": 0.001122, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000224, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000345, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000232, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000296, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000402, "vq_loss_layer_026": 0.000484, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.000782, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.0019, "vq_loss_layer_031": 0.003189 }, { "ce_loss": 2.330502, "epoch": 0.01613, "grad_norm": 0.0011437104549258947, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.049963, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050403, "step": 16130, "value_mse_loss_layer_000": 0.000422, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.027466, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.028931, "value_mse_loss_layer_027": 0.036865, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.047852, "value_mse_loss_layer_030": 0.053223, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000355, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000469, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000671, "vq_loss_layer_030": 0.001778, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.268766, "epoch": 0.01614, "grad_norm": 0.0011781357461586595, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.04957, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.049954, "step": 16140, "value_mse_loss_layer_000": 0.000406, "value_mse_loss_layer_001": 0.001129, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000189, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000309, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000211, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.301056, "epoch": 0.01615, "grad_norm": 0.0010761682642623782, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.050104, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050534, "step": 16150, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001175, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.002747 }, { "ce_loss": 2.2747, "epoch": 0.01616, "grad_norm": 0.0011404401157051325, "key_mse_loss_layer_000": 0.002823, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.128906, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.113281, "key_mse_loss_layer_016": 0.106934, "key_mse_loss_layer_017": 0.10791, "key_mse_loss_layer_018": 0.115234, "key_mse_loss_layer_019": 0.094238, "key_mse_loss_layer_020": 0.105469, "key_mse_loss_layer_021": 0.100098, "key_mse_loss_layer_022": 0.105469, "key_mse_loss_layer_023": 0.106934, "key_mse_loss_layer_024": 0.085449, "key_mse_loss_layer_025": 0.081055, "key_mse_loss_layer_026": 0.09375, "key_mse_loss_layer_027": 0.091797, "key_mse_loss_layer_028": 0.101562, "key_mse_loss_layer_029": 0.090332, "key_mse_loss_layer_030": 0.09375, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050259, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050674, "step": 16160, "value_mse_loss_layer_000": 0.000385, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004425, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000189, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000355, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000167, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000465, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.001106, "vq_loss_layer_029": 0.000923, "vq_loss_layer_030": 0.002335, "vq_loss_layer_031": 0.003677 }, { "ce_loss": 2.28096, "epoch": 0.01617, "grad_norm": 0.001131411292590201, "key_mse_loss_layer_000": 0.00354, "key_mse_loss_layer_001": 0.011841, "key_mse_loss_layer_002": 0.063965, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.072754, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.091309, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.110352, "key_mse_loss_layer_011": 0.105469, "key_mse_loss_layer_012": 0.080566, "key_mse_loss_layer_013": 0.129883, "key_mse_loss_layer_014": 0.12793, "key_mse_loss_layer_015": 0.118164, "key_mse_loss_layer_016": 0.112305, "key_mse_loss_layer_017": 0.108887, "key_mse_loss_layer_018": 0.117188, "key_mse_loss_layer_019": 0.097656, "key_mse_loss_layer_020": 0.109863, "key_mse_loss_layer_021": 0.104004, "key_mse_loss_layer_022": 0.109375, "key_mse_loss_layer_023": 0.109375, "key_mse_loss_layer_024": 0.09082, "key_mse_loss_layer_025": 0.084961, "key_mse_loss_layer_026": 0.101562, "key_mse_loss_layer_027": 0.105469, "key_mse_loss_layer_028": 0.10791, "key_mse_loss_layer_029": 0.102539, "key_mse_loss_layer_030": 0.11084, "key_mse_loss_layer_031": 0.084961, "kv_mse_loss": 0.050067, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050485, "step": 16170, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001144, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.008423, "value_mse_loss_layer_004": 0.00766, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014404, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013123, "value_mse_loss_layer_013": 0.014282, "value_mse_loss_layer_014": 0.015137, "value_mse_loss_layer_015": 0.015869, "value_mse_loss_layer_016": 0.01355, "value_mse_loss_layer_017": 0.016724, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.044678, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.064941, "value_mse_loss_layer_031": 0.053223, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000215, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000286, "vq_loss_layer_013": 0.000231, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000322, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.00032, "vq_loss_layer_025": 0.000462, "vq_loss_layer_026": 0.000576, "vq_loss_layer_027": 0.000801, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.001724, "vq_loss_layer_030": 0.00296, "vq_loss_layer_031": 0.0047 }, { "ce_loss": 2.337207, "epoch": 0.01618, "grad_norm": 0.0011370296124368906, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010986, "key_mse_loss_layer_002": 0.0625, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.073242, "key_mse_loss_layer_007": 0.081055, "key_mse_loss_layer_008": 0.092285, "key_mse_loss_layer_009": 0.097168, "key_mse_loss_layer_010": 0.11084, "key_mse_loss_layer_011": 0.107422, "key_mse_loss_layer_012": 0.081543, "key_mse_loss_layer_013": 0.138672, "key_mse_loss_layer_014": 0.133789, "key_mse_loss_layer_015": 0.119629, "key_mse_loss_layer_016": 0.115723, "key_mse_loss_layer_017": 0.117676, "key_mse_loss_layer_018": 0.12793, "key_mse_loss_layer_019": 0.102051, "key_mse_loss_layer_020": 0.114258, "key_mse_loss_layer_021": 0.112305, "key_mse_loss_layer_022": 0.111816, "key_mse_loss_layer_023": 0.10791, "key_mse_loss_layer_024": 0.085938, "key_mse_loss_layer_025": 0.084473, "key_mse_loss_layer_026": 0.097656, "key_mse_loss_layer_027": 0.094727, "key_mse_loss_layer_028": 0.102051, "key_mse_loss_layer_029": 0.09082, "key_mse_loss_layer_030": 0.097656, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050079, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050491, "step": 16180, "value_mse_loss_layer_000": 0.000404, "value_mse_loss_layer_001": 0.001175, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007355, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009827, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.013, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014832, "value_mse_loss_layer_013": 0.017334, "value_mse_loss_layer_014": 0.017822, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.015747, "value_mse_loss_layer_017": 0.020142, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.024536, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.039062, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.062256, "value_mse_loss_layer_031": 0.052246, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1.3e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000187, "vq_loss_layer_009": 0.000271, "vq_loss_layer_010": 0.000229, "vq_loss_layer_011": 0.000219, "vq_loss_layer_012": 0.00038, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.00045, "vq_loss_layer_015": 0.000456, "vq_loss_layer_016": 0.000368, "vq_loss_layer_017": 0.000364, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000205, "vq_loss_layer_020": 0.000273, "vq_loss_layer_021": 0.000454, "vq_loss_layer_022": 0.000326, "vq_loss_layer_023": 0.00032, "vq_loss_layer_024": 0.000351, "vq_loss_layer_025": 0.000626, "vq_loss_layer_026": 0.000748, "vq_loss_layer_027": 0.000832, "vq_loss_layer_028": 0.001289, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002808, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.29204, "epoch": 0.01619, "grad_norm": 0.0011093675857409835, "key_mse_loss_layer_000": 0.003738, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.043945, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.050012, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050436, "step": 16190, "value_mse_loss_layer_000": 0.000427, "value_mse_loss_layer_001": 0.001175, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007812, "value_mse_loss_layer_005": 0.007202, "value_mse_loss_layer_006": 0.009216, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011597, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.037354, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000188, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000211, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000319, "vq_loss_layer_024": 0.000277, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.003403 }, { "ce_loss": 2.306814, "epoch": 0.0162, "grad_norm": 0.0012154196156188846, "key_mse_loss_layer_000": 0.002686, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.096191, "key_mse_loss_layer_010": 0.109863, "key_mse_loss_layer_011": 0.104004, "key_mse_loss_layer_012": 0.079102, "key_mse_loss_layer_013": 0.138672, "key_mse_loss_layer_014": 0.136719, "key_mse_loss_layer_015": 0.121094, "key_mse_loss_layer_016": 0.115234, "key_mse_loss_layer_017": 0.117676, "key_mse_loss_layer_018": 0.121094, "key_mse_loss_layer_019": 0.097656, "key_mse_loss_layer_020": 0.111816, "key_mse_loss_layer_021": 0.105957, "key_mse_loss_layer_022": 0.109375, "key_mse_loss_layer_023": 0.106934, "key_mse_loss_layer_024": 0.082031, "key_mse_loss_layer_025": 0.080078, "key_mse_loss_layer_026": 0.095215, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.098145, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.093262, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049933, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050354, "step": 16200, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.001099, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.016846, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.014404, "value_mse_loss_layer_019": 0.01709, "value_mse_loss_layer_020": 0.019287, "value_mse_loss_layer_021": 0.020874, "value_mse_loss_layer_022": 0.020874, "value_mse_loss_layer_023": 0.022339, "value_mse_loss_layer_024": 0.025146, "value_mse_loss_layer_025": 0.030396, "value_mse_loss_layer_026": 0.027222, "value_mse_loss_layer_027": 0.033936, "value_mse_loss_layer_028": 0.039062, "value_mse_loss_layer_029": 0.043701, "value_mse_loss_layer_030": 0.048828, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 4.7e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.00022, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000393, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000389, "vq_loss_layer_017": 0.000431, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000231, "vq_loss_layer_021": 0.000368, "vq_loss_layer_022": 0.000322, "vq_loss_layer_023": 0.000328, "vq_loss_layer_024": 0.00032, "vq_loss_layer_025": 0.000496, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.00087, "vq_loss_layer_029": 0.000809, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.003632 }, { "ce_loss": 2.325686, "epoch": 0.01621, "grad_norm": 0.0010121279628947377, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.123535, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.102539, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049725, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050134, "step": 16210, "value_mse_loss_layer_000": 0.00041, "value_mse_loss_layer_001": 0.001122, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015015, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019409, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.022095, "value_mse_loss_layer_023": 0.024292, "value_mse_loss_layer_024": 0.026489, "value_mse_loss_layer_025": 0.032227, "value_mse_loss_layer_026": 0.027832, "value_mse_loss_layer_027": 0.036133, "value_mse_loss_layer_028": 0.040527, "value_mse_loss_layer_029": 0.047119, "value_mse_loss_layer_030": 0.05127, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000269, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000236, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.000721, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.002884 }, { "ce_loss": 2.241034, "epoch": 0.01622, "grad_norm": 0.0013872324489057064, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.050311, "kv_vq_loss": 0.000415, "learning_rate": 0.001, "loss": 0.050745, "step": 16220, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001129, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.013672, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.00033, "vq_loss_layer_016": 0.00029, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000465, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.002441, "vq_loss_layer_031": 0.002914 }, { "ce_loss": 2.283313, "epoch": 0.01623, "grad_norm": 0.0010742051526904106, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.094727, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.050153, "kv_vq_loss": 0.000415, "learning_rate": 0.001, "loss": 0.050592, "step": 16230, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000311, "vq_loss_layer_015": 0.000444, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000347, "vq_loss_layer_018": 0.000167, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.000224, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000391, "vq_loss_layer_027": 0.000416, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.001709, "vq_loss_layer_031": 0.002625 }, { "ce_loss": 2.347395, "epoch": 0.01624, "grad_norm": 0.001178553095087409, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.0495, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049893, "step": 16240, "value_mse_loss_layer_000": 0.000406, "value_mse_loss_layer_001": 0.001144, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.001167, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.279397, "epoch": 0.01625, "grad_norm": 0.0011091881897300482, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050311, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050726, "step": 16250, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001129, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000205, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000317, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000242, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000763, "vq_loss_layer_030": 0.001686, "vq_loss_layer_031": 0.002762 }, { "ce_loss": 2.318888, "epoch": 0.01626, "grad_norm": 0.0011001266539096832, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.05018, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.050623, "step": 16260, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025146, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.049805, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000238, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000227, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000439, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000759, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.002625 }, { "ce_loss": 2.309995, "epoch": 0.01627, "grad_norm": 0.0011698756134137511, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.049905, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050293, "step": 16270, "value_mse_loss_layer_000": 0.000389, "value_mse_loss_layer_001": 0.001114, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000193, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000431, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.001579, "vq_loss_layer_031": 0.002838 }, { "ce_loss": 2.28547, "epoch": 0.01628, "grad_norm": 0.0012438774574548006, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050116, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.05054, "step": 16280, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001099, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000399, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000359, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000298, "vq_loss_layer_024": 0.000212, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000391, "vq_loss_layer_027": 0.000378, "vq_loss_layer_028": 0.001259, "vq_loss_layer_029": 0.000751, "vq_loss_layer_030": 0.00145, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.323652, "epoch": 0.01629, "grad_norm": 0.0010255364468321204, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050067, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050497, "step": 16290, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001129, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 4.7e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000132, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.000146, "vq_loss_layer_011": 0.000172, "vq_loss_layer_012": 0.000298, "vq_loss_layer_013": 0.000248, "vq_loss_layer_014": 0.000309, "vq_loss_layer_015": 0.000332, "vq_loss_layer_016": 0.000284, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000157, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000163, "vq_loss_layer_021": 0.000278, "vq_loss_layer_022": 0.000187, "vq_loss_layer_023": 0.000197, "vq_loss_layer_024": 0.000185, "vq_loss_layer_025": 0.000219, "vq_loss_layer_026": 0.000324, "vq_loss_layer_027": 0.000404, "vq_loss_layer_028": 0.000557, "vq_loss_layer_029": 0.000706, "vq_loss_layer_030": 0.001427, "vq_loss_layer_031": 0.002319 }, { "ce_loss": 2.307854, "epoch": 0.0163, "grad_norm": 0.0010725697502493858, "key_mse_loss_layer_000": 0.002808, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.044922, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.124023, "key_mse_loss_layer_014": 0.120117, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.049686, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050104, "step": 16300, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015198, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.024048, "value_mse_loss_layer_024": 0.026611, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.028687, "value_mse_loss_layer_027": 0.036133, "value_mse_loss_layer_028": 0.041016, "value_mse_loss_layer_029": 0.04834, "value_mse_loss_layer_030": 0.053223, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000218, "vq_loss_layer_021": 0.000399, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000425, "vq_loss_layer_026": 0.000561, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.000904, "vq_loss_layer_029": 0.001297, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.004425 }, { "ce_loss": 2.325434, "epoch": 0.01631, "grad_norm": 0.0009630410349927843, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.058594, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049734, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050146, "step": 16310, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.001114, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000136, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000153, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000301, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000164, "vq_loss_layer_019": 0.000133, "vq_loss_layer_020": 0.00016, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000187, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.00019, "vq_loss_layer_025": 0.000243, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.000433, "vq_loss_layer_028": 0.000595, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.002609 }, { "ce_loss": 2.323285, "epoch": 0.01632, "grad_norm": 0.0011779097840189934, "key_mse_loss_layer_000": 0.003586, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050189, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.05058, "step": 16320, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.006897, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013367, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.028931, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.03418, "value_mse_loss_layer_027": 0.044922, "value_mse_loss_layer_028": 0.050293, "value_mse_loss_layer_029": 0.05835, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000211, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000159, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000242, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.00135, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.003555 }, { "ce_loss": 2.326655, "epoch": 0.01633, "grad_norm": 0.0010452567366883159, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.093262, "key_mse_loss_layer_009": 0.101074, "key_mse_loss_layer_010": 0.114258, "key_mse_loss_layer_011": 0.109863, "key_mse_loss_layer_012": 0.082031, "key_mse_loss_layer_013": 0.147461, "key_mse_loss_layer_014": 0.144531, "key_mse_loss_layer_015": 0.129883, "key_mse_loss_layer_016": 0.12793, "key_mse_loss_layer_017": 0.125977, "key_mse_loss_layer_018": 0.131836, "key_mse_loss_layer_019": 0.106445, "key_mse_loss_layer_020": 0.123047, "key_mse_loss_layer_021": 0.116211, "key_mse_loss_layer_022": 0.124023, "key_mse_loss_layer_023": 0.124023, "key_mse_loss_layer_024": 0.098145, "key_mse_loss_layer_025": 0.091309, "key_mse_loss_layer_026": 0.109375, "key_mse_loss_layer_027": 0.104492, "key_mse_loss_layer_028": 0.114258, "key_mse_loss_layer_029": 0.099609, "key_mse_loss_layer_030": 0.117676, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.050165, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.050558, "step": 16330, "value_mse_loss_layer_000": 0.000387, "value_mse_loss_layer_001": 0.001099, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.016235, "value_mse_loss_layer_016": 0.013184, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.014648, "value_mse_loss_layer_019": 0.017334, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.021484, "value_mse_loss_layer_023": 0.024292, "value_mse_loss_layer_024": 0.0271, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.041748, "value_mse_loss_layer_029": 0.048828, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000232, "vq_loss_layer_011": 0.000226, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000402, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000218, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000294, "vq_loss_layer_023": 0.000319, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000429, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.000851, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003464 }, { "ce_loss": 2.299767, "epoch": 0.01634, "grad_norm": 0.001237712800502777, "key_mse_loss_layer_000": 0.003662, "key_mse_loss_layer_001": 0.011047, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.101074, "key_mse_loss_layer_023": 0.100098, "key_mse_loss_layer_024": 0.083496, "key_mse_loss_layer_025": 0.080566, "key_mse_loss_layer_026": 0.091309, "key_mse_loss_layer_027": 0.097656, "key_mse_loss_layer_028": 0.100098, "key_mse_loss_layer_029": 0.099121, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.080566, "kv_mse_loss": 0.049963, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050394, "step": 16340, "value_mse_loss_layer_000": 0.000422, "value_mse_loss_layer_001": 0.00119, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.014526, "value_mse_loss_layer_010": 0.011536, "value_mse_loss_layer_011": 0.012207, "value_mse_loss_layer_012": 0.013184, "value_mse_loss_layer_013": 0.014465, "value_mse_loss_layer_014": 0.015259, "value_mse_loss_layer_015": 0.01709, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.021973, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.025879, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.034668, "value_mse_loss_layer_025": 0.039795, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.04834, "value_mse_loss_layer_028": 0.055176, "value_mse_loss_layer_029": 0.062988, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.000135, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000181, "vq_loss_layer_010": 0.000146, "vq_loss_layer_011": 0.00017, "vq_loss_layer_012": 0.000286, "vq_loss_layer_013": 0.000226, "vq_loss_layer_014": 0.00029, "vq_loss_layer_015": 0.000307, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000225, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000167, "vq_loss_layer_021": 0.000235, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.000847, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.001823, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.287481, "epoch": 0.01635, "grad_norm": 0.0010693313088268042, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.043945, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.085449, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.059326, "kv_mse_loss": 0.049799, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050195, "step": 16350, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.001122, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.019409, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000189, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000194, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000345, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.00087, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.003632 }, { "ce_loss": 2.324118, "epoch": 0.01636, "grad_norm": 0.0010272590443491936, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049948, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.050342, "step": 16360, "value_mse_loss_layer_000": 0.00041, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.00022, "vq_loss_layer_023": 0.000218, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.000368, "vq_loss_layer_027": 0.000452, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.00079, "vq_loss_layer_030": 0.001579, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.299502, "epoch": 0.01637, "grad_norm": 0.001254043192602694, "key_mse_loss_layer_000": 0.002884, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050098, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050522, "step": 16370, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000351, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000234, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000347, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.337941, "epoch": 0.01638, "grad_norm": 0.0009414549567736685, "key_mse_loss_layer_000": 0.003586, "key_mse_loss_layer_001": 0.011475, "key_mse_loss_layer_002": 0.06543, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.069336, "key_mse_loss_layer_006": 0.07373, "key_mse_loss_layer_007": 0.08252, "key_mse_loss_layer_008": 0.098633, "key_mse_loss_layer_009": 0.102051, "key_mse_loss_layer_010": 0.115723, "key_mse_loss_layer_011": 0.112305, "key_mse_loss_layer_012": 0.084961, "key_mse_loss_layer_013": 0.140625, "key_mse_loss_layer_014": 0.134766, "key_mse_loss_layer_015": 0.123535, "key_mse_loss_layer_016": 0.118652, "key_mse_loss_layer_017": 0.114258, "key_mse_loss_layer_018": 0.126953, "key_mse_loss_layer_019": 0.103516, "key_mse_loss_layer_020": 0.118164, "key_mse_loss_layer_021": 0.110352, "key_mse_loss_layer_022": 0.119141, "key_mse_loss_layer_023": 0.119629, "key_mse_loss_layer_024": 0.098633, "key_mse_loss_layer_025": 0.089355, "key_mse_loss_layer_026": 0.108887, "key_mse_loss_layer_027": 0.106934, "key_mse_loss_layer_028": 0.114258, "key_mse_loss_layer_029": 0.099609, "key_mse_loss_layer_030": 0.116211, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.050085, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050504, "step": 16380, "value_mse_loss_layer_000": 0.000383, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007538, "value_mse_loss_layer_005": 0.006836, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.01416, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.016357, "value_mse_loss_layer_016": 0.013367, "value_mse_loss_layer_017": 0.016113, "value_mse_loss_layer_018": 0.015015, "value_mse_loss_layer_019": 0.017334, "value_mse_loss_layer_020": 0.019165, "value_mse_loss_layer_021": 0.020874, "value_mse_loss_layer_022": 0.02124, "value_mse_loss_layer_023": 0.023926, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 6.3e-05, "vq_loss_layer_005": 7.6e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000192, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000242, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.00038, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.00038, "vq_loss_layer_017": 0.000248, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000311, "vq_loss_layer_023": 0.000326, "vq_loss_layer_024": 0.000444, "vq_loss_layer_025": 0.000629, "vq_loss_layer_026": 0.000645, "vq_loss_layer_027": 0.000999, "vq_loss_layer_028": 0.001328, "vq_loss_layer_029": 0.001236, "vq_loss_layer_030": 0.003998, "vq_loss_layer_031": 0.004211 }, { "ce_loss": 2.305963, "epoch": 0.01639, "grad_norm": 0.001179132261313498, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049973, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.050345, "step": 16390, "value_mse_loss_layer_000": 0.00041, "value_mse_loss_layer_001": 0.001129, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000167, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000387, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000229, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000832, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.00148, "vq_loss_layer_031": 0.002701 }, { "ce_loss": 2.333884, "epoch": 0.0164, "grad_norm": 0.0011233818950131536, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.104004, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.12207, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.100098, "key_mse_loss_layer_017": 0.103027, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.098145, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050143, "kv_vq_loss": 0.00041, "learning_rate": 0.001, "loss": 0.050583, "step": 16400, "value_mse_loss_layer_000": 0.000397, "value_mse_loss_layer_001": 0.001114, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007446, "value_mse_loss_layer_005": 0.007019, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014587, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.0002, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000242, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000207, "vq_loss_layer_021": 0.000362, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000328, "vq_loss_layer_025": 0.000444, "vq_loss_layer_026": 0.000534, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.000908, "vq_loss_layer_029": 0.001221, "vq_loss_layer_030": 0.002487, "vq_loss_layer_031": 0.003387 }, { "ce_loss": 2.248029, "epoch": 0.01641, "grad_norm": 0.0012292409082874656, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.10791, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.081543, "key_mse_loss_layer_013": 0.142578, "key_mse_loss_layer_014": 0.136719, "key_mse_loss_layer_015": 0.123535, "key_mse_loss_layer_016": 0.120117, "key_mse_loss_layer_017": 0.12207, "key_mse_loss_layer_018": 0.128906, "key_mse_loss_layer_019": 0.103516, "key_mse_loss_layer_020": 0.120605, "key_mse_loss_layer_021": 0.112793, "key_mse_loss_layer_022": 0.118164, "key_mse_loss_layer_023": 0.118164, "key_mse_loss_layer_024": 0.092285, "key_mse_loss_layer_025": 0.085938, "key_mse_loss_layer_026": 0.100098, "key_mse_loss_layer_027": 0.096191, "key_mse_loss_layer_028": 0.104492, "key_mse_loss_layer_029": 0.091309, "key_mse_loss_layer_030": 0.102051, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050101, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050543, "step": 16410, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001122, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007751, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009521, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.01709, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.021606, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.050537, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 6.2e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000209, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000197, "vq_loss_layer_020": 0.000233, "vq_loss_layer_021": 0.00036, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.000303, "vq_loss_layer_025": 0.000538, "vq_loss_layer_026": 0.00061, "vq_loss_layer_027": 0.000637, "vq_loss_layer_028": 0.00103, "vq_loss_layer_029": 0.001396, "vq_loss_layer_030": 0.002975, "vq_loss_layer_031": 0.003891 }, { "ce_loss": 2.291209, "epoch": 0.01642, "grad_norm": 0.0011382828233763576, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.049774, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050171, "step": 16420, "value_mse_loss_layer_000": 0.000404, "value_mse_loss_layer_001": 0.001122, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000144, "vq_loss_layer_009": 0.000184, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000144, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.000231, "vq_loss_layer_024": 0.000213, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000376, "vq_loss_layer_027": 0.000427, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000763, "vq_loss_layer_030": 0.001945, "vq_loss_layer_031": 0.002625 }, { "ce_loss": 2.309052, "epoch": 0.01643, "grad_norm": 0.001177447265945375, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.060303, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.072266, "key_mse_loss_layer_007": 0.080078, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.097168, "key_mse_loss_layer_010": 0.108887, "key_mse_loss_layer_011": 0.106934, "key_mse_loss_layer_012": 0.078613, "key_mse_loss_layer_013": 0.129883, "key_mse_loss_layer_014": 0.125977, "key_mse_loss_layer_015": 0.116699, "key_mse_loss_layer_016": 0.107422, "key_mse_loss_layer_017": 0.109863, "key_mse_loss_layer_018": 0.115234, "key_mse_loss_layer_019": 0.094727, "key_mse_loss_layer_020": 0.105469, "key_mse_loss_layer_021": 0.101074, "key_mse_loss_layer_022": 0.102051, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.088867, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049719, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.050116, "step": 16430, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.014465, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.01416, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.017456, "value_mse_loss_layer_020": 0.018677, "value_mse_loss_layer_021": 0.02124, "value_mse_loss_layer_022": 0.021973, "value_mse_loss_layer_023": 0.022217, "value_mse_loss_layer_024": 0.026245, "value_mse_loss_layer_025": 0.031128, "value_mse_loss_layer_026": 0.027222, "value_mse_loss_layer_027": 0.036865, "value_mse_loss_layer_028": 0.040283, "value_mse_loss_layer_029": 0.046631, "value_mse_loss_layer_030": 0.052002, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.8e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 8.7e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000182, "vq_loss_layer_008": 0.000204, "vq_loss_layer_009": 0.000269, "vq_loss_layer_010": 0.000239, "vq_loss_layer_011": 0.000294, "vq_loss_layer_012": 0.000364, "vq_loss_layer_013": 0.00034, "vq_loss_layer_014": 0.000431, "vq_loss_layer_015": 0.000473, "vq_loss_layer_016": 0.000378, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000212, "vq_loss_layer_019": 0.000211, "vq_loss_layer_020": 0.000223, "vq_loss_layer_021": 0.000387, "vq_loss_layer_022": 0.000319, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000366, "vq_loss_layer_025": 0.000507, "vq_loss_layer_026": 0.000622, "vq_loss_layer_027": 0.000683, "vq_loss_layer_028": 0.000973, "vq_loss_layer_029": 0.000969, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.315605, "epoch": 0.01644, "grad_norm": 0.0009927048813551664, "key_mse_loss_layer_000": 0.003677, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.049725, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050131, "step": 16440, "value_mse_loss_layer_000": 0.000414, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000269, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000359, "vq_loss_layer_026": 0.000469, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.001122, "vq_loss_layer_030": 0.002686, "vq_loss_layer_031": 0.003265 }, { "ce_loss": 2.346331, "epoch": 0.01645, "grad_norm": 0.001188088906928897, "key_mse_loss_layer_000": 0.00264, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.044189, "key_mse_loss_layer_004": 0.041504, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.090332, "key_mse_loss_layer_009": 0.097656, "key_mse_loss_layer_010": 0.111816, "key_mse_loss_layer_011": 0.106934, "key_mse_loss_layer_012": 0.079102, "key_mse_loss_layer_013": 0.144531, "key_mse_loss_layer_014": 0.139648, "key_mse_loss_layer_015": 0.123047, "key_mse_loss_layer_016": 0.117188, "key_mse_loss_layer_017": 0.117676, "key_mse_loss_layer_018": 0.121094, "key_mse_loss_layer_019": 0.095703, "key_mse_loss_layer_020": 0.111328, "key_mse_loss_layer_021": 0.105469, "key_mse_loss_layer_022": 0.109375, "key_mse_loss_layer_023": 0.106445, "key_mse_loss_layer_024": 0.082031, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.092773, "key_mse_loss_layer_027": 0.087891, "key_mse_loss_layer_028": 0.097656, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.04986, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.050241, "step": 16450, "value_mse_loss_layer_000": 0.000387, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.016846, "value_mse_loss_layer_016": 0.013367, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.014404, "value_mse_loss_layer_019": 0.017334, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.020386, "value_mse_loss_layer_023": 0.023804, "value_mse_loss_layer_024": 0.025391, "value_mse_loss_layer_025": 0.032227, "value_mse_loss_layer_026": 0.028442, "value_mse_loss_layer_027": 0.035889, "value_mse_loss_layer_028": 0.040039, "value_mse_loss_layer_029": 0.04541, "value_mse_loss_layer_030": 0.050293, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.000221, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000425, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000366, "vq_loss_layer_018": 0.000166, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000244, "vq_loss_layer_021": 0.000412, "vq_loss_layer_022": 0.000309, "vq_loss_layer_023": 0.000338, "vq_loss_layer_024": 0.000334, "vq_loss_layer_025": 0.0005, "vq_loss_layer_026": 0.000557, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000965, "vq_loss_layer_029": 0.000763, "vq_loss_layer_030": 0.001938, "vq_loss_layer_031": 0.00351 }, { "ce_loss": 2.306604, "epoch": 0.01646, "grad_norm": 0.0012686424888670444, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.050012, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050433, "step": 16460, "value_mse_loss_layer_000": 0.000406, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.024902, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000298, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000301, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000173, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000211, "vq_loss_layer_024": 0.000197, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000374, "vq_loss_layer_027": 0.000452, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.00082, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.002762 }, { "ce_loss": 2.260109, "epoch": 0.01647, "grad_norm": 0.001184940803796053, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.041992, "key_mse_loss_layer_005": 0.05542, "key_mse_loss_layer_006": 0.062012, "key_mse_loss_layer_007": 0.070801, "key_mse_loss_layer_008": 0.079102, "key_mse_loss_layer_009": 0.082031, "key_mse_loss_layer_010": 0.09375, "key_mse_loss_layer_011": 0.092773, "key_mse_loss_layer_012": 0.068359, "key_mse_loss_layer_013": 0.106445, "key_mse_loss_layer_014": 0.103027, "key_mse_loss_layer_015": 0.093262, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.088379, "key_mse_loss_layer_018": 0.094727, "key_mse_loss_layer_019": 0.082031, "key_mse_loss_layer_020": 0.088379, "key_mse_loss_layer_021": 0.083984, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.072754, "key_mse_loss_layer_031": 0.057861, "kv_mse_loss": 0.049847, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050272, "step": 16470, "value_mse_loss_layer_000": 0.000401, "value_mse_loss_layer_001": 0.001122, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012268, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.050537, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.00013, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000172, "vq_loss_layer_012": 0.000288, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000259, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000155, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000201, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.001823, "vq_loss_layer_031": 0.003784 }, { "ce_loss": 2.326318, "epoch": 0.01648, "grad_norm": 0.0011164478491991758, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049554, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.049966, "step": 16480, "value_mse_loss_layer_000": 0.000406, "value_mse_loss_layer_001": 0.001122, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.1e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.00029, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000141, "vq_loss_layer_020": 0.000169, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.00079, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.334692, "epoch": 0.01649, "grad_norm": 0.0012787122977897525, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049786, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050174, "step": 16490, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006683, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.00016, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000205, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000458, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.001808, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.312792, "epoch": 0.0165, "grad_norm": 0.001383176539093256, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.049658, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050058, "step": 16500, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000347, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000359, "vq_loss_layer_026": 0.000515, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000896, "vq_loss_layer_029": 0.001122, "vq_loss_layer_030": 0.002609, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.310887, "epoch": 0.01651, "grad_norm": 0.0010292426450178027, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049777, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050204, "step": 16510, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.001114, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019531, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000144, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000151, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000219, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000227, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.001717, "vq_loss_layer_031": 0.002533 }, { "ce_loss": 2.321067, "epoch": 0.01652, "grad_norm": 0.0013699100818485022, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.052002, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049496, "kv_vq_loss": 0.000377, "learning_rate": 0.001, "loss": 0.049875, "step": 16520, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013306, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.031982, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000167, "vq_loss_layer_012": 0.000284, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000246, "vq_loss_layer_018": 0.000167, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000155, "vq_loss_layer_021": 0.000263, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.00019, "vq_loss_layer_024": 0.000242, "vq_loss_layer_025": 0.000197, "vq_loss_layer_026": 0.000326, "vq_loss_layer_027": 0.000408, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.002472 }, { "ce_loss": 2.322919, "epoch": 0.01653, "grad_norm": 0.0011302267666906118, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.056152, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049756, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050208, "step": 16530, "value_mse_loss_layer_000": 0.000397, "value_mse_loss_layer_001": 0.001099, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000144, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000214, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000206, "vq_loss_layer_025": 0.000376, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000626, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.001671, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.311377, "epoch": 0.01654, "grad_norm": 0.0011467353906482458, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.106445, "key_mse_loss_layer_014": 0.103027, "key_mse_loss_layer_015": 0.092773, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049866, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050262, "step": 16540, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001114, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014526, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012268, "value_mse_loss_layer_012": 0.013306, "value_mse_loss_layer_013": 0.014526, "value_mse_loss_layer_014": 0.01532, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.000138, "vq_loss_layer_008": 0.000142, "vq_loss_layer_009": 0.00017, "vq_loss_layer_010": 0.000152, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000286, "vq_loss_layer_013": 0.000239, "vq_loss_layer_014": 0.000298, "vq_loss_layer_015": 0.000326, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000252, "vq_loss_layer_018": 0.000222, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000153, "vq_loss_layer_021": 0.000248, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000215, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.337827, "epoch": 0.01655, "grad_norm": 0.0011290117399767041, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.049753, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050153, "step": 16550, "value_mse_loss_layer_000": 0.000404, "value_mse_loss_layer_001": 0.001129, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014038, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015198, "value_mse_loss_layer_019": 0.017578, "value_mse_loss_layer_020": 0.019287, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.022339, "value_mse_loss_layer_023": 0.025024, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000218, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000269, "vq_loss_layer_018": 0.000164, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.00036, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000374, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.001015, "vq_loss_layer_029": 0.001106, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003693 }, { "ce_loss": 2.305142, "epoch": 0.01656, "grad_norm": 0.001073853694833815, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050189, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050626, "step": 16560, "value_mse_loss_layer_000": 0.000406, "value_mse_loss_layer_001": 0.001122, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.006744, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013367, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000189, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.00029, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000225, "vq_loss_layer_024": 0.000203, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.000378, "vq_loss_layer_027": 0.000414, "vq_loss_layer_028": 0.000626, "vq_loss_layer_029": 0.000732, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.002289 }, { "ce_loss": 2.310968, "epoch": 0.01657, "grad_norm": 0.0010281268041580915, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049741, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050162, "step": 16570, "value_mse_loss_layer_000": 0.000401, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000135, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.002625 }, { "ce_loss": 2.336371, "epoch": 0.01658, "grad_norm": 0.0010947618866339326, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.049417, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.049826, "step": 16580, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001122, "value_mse_loss_layer_002": 0.004333, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.025391, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.00024, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000622, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.001823, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.293799, "epoch": 0.01659, "grad_norm": 0.0012160323094576597, "key_mse_loss_layer_000": 0.003845, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.12207, "key_mse_loss_layer_015": 0.109863, "key_mse_loss_layer_016": 0.104492, "key_mse_loss_layer_017": 0.105469, "key_mse_loss_layer_018": 0.113281, "key_mse_loss_layer_019": 0.094238, "key_mse_loss_layer_020": 0.10498, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.101562, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049857, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050266, "step": 16590, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001152, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000233, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.000364, "vq_loss_layer_022": 0.000241, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000483, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000896, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.306304, "epoch": 0.0166, "grad_norm": 0.001164231332950294, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.091797, "key_mse_loss_layer_009": 0.098145, "key_mse_loss_layer_010": 0.112305, "key_mse_loss_layer_011": 0.105957, "key_mse_loss_layer_012": 0.080078, "key_mse_loss_layer_013": 0.145508, "key_mse_loss_layer_014": 0.141602, "key_mse_loss_layer_015": 0.129883, "key_mse_loss_layer_016": 0.128906, "key_mse_loss_layer_017": 0.12793, "key_mse_loss_layer_018": 0.134766, "key_mse_loss_layer_019": 0.105469, "key_mse_loss_layer_020": 0.125, "key_mse_loss_layer_021": 0.116211, "key_mse_loss_layer_022": 0.125977, "key_mse_loss_layer_023": 0.125977, "key_mse_loss_layer_024": 0.100098, "key_mse_loss_layer_025": 0.091797, "key_mse_loss_layer_026": 0.105957, "key_mse_loss_layer_027": 0.102051, "key_mse_loss_layer_028": 0.111328, "key_mse_loss_layer_029": 0.095215, "key_mse_loss_layer_030": 0.114258, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.050049, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050464, "step": 16600, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.001114, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014465, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013367, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.01532, "value_mse_loss_layer_015": 0.016602, "value_mse_loss_layer_016": 0.013489, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015076, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.021606, "value_mse_loss_layer_023": 0.024902, "value_mse_loss_layer_024": 0.02771, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.028931, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.04834, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.00032, "vq_loss_layer_016": 0.000288, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000214, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000418, "vq_loss_layer_028": 0.000908, "vq_loss_layer_029": 0.000732, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.292344, "epoch": 0.01661, "grad_norm": 0.00117974984459579, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.05, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050409, "step": 16610, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.001099, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006195, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000138, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000149, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.000288, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000159, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000165, "vq_loss_layer_021": 0.000271, "vq_loss_layer_022": 0.000175, "vq_loss_layer_023": 0.000197, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000214, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.000568, "vq_loss_layer_029": 0.000763, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.318609, "epoch": 0.01662, "grad_norm": 0.0012674200115725398, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.077637, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.050018, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050427, "step": 16620, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.014771, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.036865, "value_mse_loss_layer_026": 0.032959, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047607, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000182, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000172, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000237, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000282, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.002808 }, { "ce_loss": 2.327575, "epoch": 0.01663, "grad_norm": 0.001076668268069625, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.049487, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.049896, "step": 16630, "value_mse_loss_layer_000": 0.000414, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015259, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.025391, "value_mse_loss_layer_024": 0.028076, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.053223, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000139, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000698, "vq_loss_layer_030": 0.001556, "vq_loss_layer_031": 0.002563 }, { "ce_loss": 2.302415, "epoch": 0.01664, "grad_norm": 0.001077511114999652, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.07959, "key_mse_loss_layer_009": 0.082031, "key_mse_loss_layer_010": 0.094238, "key_mse_loss_layer_011": 0.093262, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.10498, "key_mse_loss_layer_014": 0.102051, "key_mse_loss_layer_015": 0.093262, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.102539, "key_mse_loss_layer_024": 0.086426, "key_mse_loss_layer_025": 0.083496, "key_mse_loss_layer_026": 0.092285, "key_mse_loss_layer_027": 0.097656, "key_mse_loss_layer_028": 0.101074, "key_mse_loss_layer_029": 0.101074, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049857, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050269, "step": 16640, "value_mse_loss_layer_000": 0.000412, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011658, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.019897, "value_mse_loss_layer_016": 0.016235, "value_mse_loss_layer_017": 0.021606, "value_mse_loss_layer_018": 0.017578, "value_mse_loss_layer_019": 0.021484, "value_mse_loss_layer_020": 0.023682, "value_mse_loss_layer_021": 0.026733, "value_mse_loss_layer_022": 0.027954, "value_mse_loss_layer_023": 0.032471, "value_mse_loss_layer_024": 0.041504, "value_mse_loss_layer_025": 0.044189, "value_mse_loss_layer_026": 0.044434, "value_mse_loss_layer_027": 0.054199, "value_mse_loss_layer_028": 0.062988, "value_mse_loss_layer_029": 0.074219, "value_mse_loss_layer_030": 0.077148, "value_mse_loss_layer_031": 0.05957, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000186, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.000171, "vq_loss_layer_012": 0.000278, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000282, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000269, "vq_loss_layer_017": 0.000256, "vq_loss_layer_018": 0.000143, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000135, "vq_loss_layer_021": 0.000248, "vq_loss_layer_022": 0.000187, "vq_loss_layer_023": 0.000196, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000441, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.001167, "vq_loss_layer_029": 0.00132, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.003815 }, { "ce_loss": 2.313532, "epoch": 0.01665, "grad_norm": 0.0013079531490802765, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.05249, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.04993, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050345, "step": 16650, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000698, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001801, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.003693 }, { "ce_loss": 2.302217, "epoch": 0.01666, "grad_norm": 0.0010795560665428638, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.044922, "key_mse_loss_layer_004": 0.04126, "key_mse_loss_layer_005": 0.055176, "key_mse_loss_layer_006": 0.062012, "key_mse_loss_layer_007": 0.071777, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.081543, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.070801, "key_mse_loss_layer_031": 0.05542, "kv_mse_loss": 0.049817, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050259, "step": 16660, "value_mse_loss_layer_000": 0.000416, "value_mse_loss_layer_001": 0.001137, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.015198, "value_mse_loss_layer_013": 0.01709, "value_mse_loss_layer_014": 0.0177, "value_mse_loss_layer_015": 0.020142, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.019653, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.024292, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.049072, "value_mse_loss_layer_029": 0.057129, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000209, "vq_loss_layer_011": 0.000207, "vq_loss_layer_012": 0.000362, "vq_loss_layer_013": 0.000341, "vq_loss_layer_014": 0.000423, "vq_loss_layer_015": 0.00045, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000167, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000431, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000301, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000372, "vq_loss_layer_026": 0.000456, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.001053, "vq_loss_layer_029": 0.000954, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.003693 }, { "ce_loss": 2.295685, "epoch": 0.01667, "grad_norm": 0.0010596592910587788, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.050171, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050568, "step": 16670, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001099, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.025391, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.00041, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000668, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.002579 }, { "ce_loss": 2.309244, "epoch": 0.01668, "grad_norm": 0.0010993543546646833, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049579, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.049957, "step": 16680, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001083, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.014954, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.025146, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.053223, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000163, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000381, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000269, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000713, "vq_loss_layer_030": 0.001808, "vq_loss_layer_031": 0.002762 }, { "ce_loss": 2.378941, "epoch": 0.01669, "grad_norm": 0.0011701683979481459, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049509, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.049915, "step": 16690, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.001099, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015259, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.025146, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.029297, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000319, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000225, "vq_loss_layer_021": 0.000387, "vq_loss_layer_022": 0.000263, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000374, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.304432, "epoch": 0.0167, "grad_norm": 0.001031329040415585, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049728, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050113, "step": 16700, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.049072, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000385, "vq_loss_layer_028": 0.000633, "vq_loss_layer_029": 0.000679, "vq_loss_layer_030": 0.001579, "vq_loss_layer_031": 0.002502 }, { "ce_loss": 2.328572, "epoch": 0.01671, "grad_norm": 0.00132092519197613, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.049454, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.049838, "step": 16710, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.001114, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014709, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.01532, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000207, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.000473, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.001244, "vq_loss_layer_030": 0.002686, "vq_loss_layer_031": 0.003769 }, { "ce_loss": 2.337832, "epoch": 0.01672, "grad_norm": 0.0010047182440757751, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.049719, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.050107, "step": 16720, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.001709, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.315127, "epoch": 0.01673, "grad_norm": 0.0011993118096143007, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.068359, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.049484, "kv_vq_loss": 0.000381, "learning_rate": 0.001, "loss": 0.049866, "step": 16730, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001083, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006195, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012268, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014648, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.00017, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000246, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000395, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000587, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001297, "vq_loss_layer_030": 0.002655, "vq_loss_layer_031": 0.003937 }, { "ce_loss": 2.350392, "epoch": 0.01674, "grad_norm": 0.0011643883772194386, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.049591, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.049976, "step": 16740, "value_mse_loss_layer_000": 0.000406, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.000219, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000378, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000748, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.001923, "vq_loss_layer_031": 0.003098 }, { "ce_loss": 2.344736, "epoch": 0.01675, "grad_norm": 0.0011001736856997013, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.08252, "kv_mse_loss": 0.049689, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.050079, "step": 16750, "value_mse_loss_layer_000": 0.000406, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006622, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000134, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.000176, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000243, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.00061, "vq_loss_layer_028": 0.000923, "vq_loss_layer_029": 0.001595, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.004211 }, { "ce_loss": 2.307565, "epoch": 0.01676, "grad_norm": 0.0012112429831176996, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049985, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050418, "step": 16760, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006165, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015442, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 4.7e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000135, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.000338, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000201, "vq_loss_layer_023": 0.000228, "vq_loss_layer_024": 0.000219, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000387, "vq_loss_layer_027": 0.000414, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000729, "vq_loss_layer_030": 0.001671, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.301245, "epoch": 0.01677, "grad_norm": 0.001081241643987596, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.094727, "key_mse_loss_layer_010": 0.10791, "key_mse_loss_layer_011": 0.104492, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.132812, "key_mse_loss_layer_014": 0.129883, "key_mse_loss_layer_015": 0.115723, "key_mse_loss_layer_016": 0.110352, "key_mse_loss_layer_017": 0.109863, "key_mse_loss_layer_018": 0.115723, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.105469, "key_mse_loss_layer_021": 0.099609, "key_mse_loss_layer_022": 0.104004, "key_mse_loss_layer_023": 0.100098, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.087891, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.097168, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.049692, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.050076, "step": 16770, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.013794, "value_mse_loss_layer_017": 0.01709, "value_mse_loss_layer_018": 0.015259, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019409, "value_mse_loss_layer_021": 0.021362, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.024414, "value_mse_loss_layer_024": 0.027466, "value_mse_loss_layer_025": 0.032471, "value_mse_loss_layer_026": 0.028564, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.041748, "value_mse_loss_layer_029": 0.04834, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000399, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.00079, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.002579 }, { "ce_loss": 2.287195, "epoch": 0.01678, "grad_norm": 0.0011730141704902053, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.099609, "key_mse_loss_layer_024": 0.081055, "key_mse_loss_layer_025": 0.080078, "key_mse_loss_layer_026": 0.088867, "key_mse_loss_layer_027": 0.092285, "key_mse_loss_layer_028": 0.096191, "key_mse_loss_layer_029": 0.094238, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049557, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.049954, "step": 16780, "value_mse_loss_layer_000": 0.000389, "value_mse_loss_layer_001": 0.001083, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014771, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015503, "value_mse_loss_layer_017": 0.019531, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.024536, "value_mse_loss_layer_022": 0.027222, "value_mse_loss_layer_023": 0.029785, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.040527, "value_mse_loss_layer_026": 0.037842, "value_mse_loss_layer_027": 0.048096, "value_mse_loss_layer_028": 0.054932, "value_mse_loss_layer_029": 0.063477, "value_mse_loss_layer_030": 0.066895, "value_mse_loss_layer_031": 0.054199, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000296, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000197, "vq_loss_layer_020": 0.00016, "vq_loss_layer_021": 0.000271, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.000347, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000935, "vq_loss_layer_029": 0.001259, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.003418 }, { "ce_loss": 2.31459, "epoch": 0.01679, "grad_norm": 0.001281380420550704, "key_mse_loss_layer_000": 0.004639, "key_mse_loss_layer_001": 0.011658, "key_mse_loss_layer_002": 0.060791, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.083984, "key_mse_loss_layer_025": 0.078613, "key_mse_loss_layer_026": 0.091309, "key_mse_loss_layer_027": 0.096191, "key_mse_loss_layer_028": 0.098145, "key_mse_loss_layer_029": 0.095215, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.04978, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050192, "step": 16790, "value_mse_loss_layer_000": 0.000429, "value_mse_loss_layer_001": 0.001183, "value_mse_loss_layer_002": 0.004364, "value_mse_loss_layer_003": 0.00824, "value_mse_loss_layer_004": 0.007874, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.00885, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013367, "value_mse_loss_layer_013": 0.014465, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019653, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.029663, "value_mse_loss_layer_024": 0.037109, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.037598, "value_mse_loss_layer_027": 0.049072, "value_mse_loss_layer_028": 0.052979, "value_mse_loss_layer_029": 0.063965, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000125, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000207, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000237, "vq_loss_layer_019": 0.000211, "vq_loss_layer_020": 0.000261, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000215, "vq_loss_layer_024": 0.000565, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000618, "vq_loss_layer_027": 0.000767, "vq_loss_layer_028": 0.001251, "vq_loss_layer_029": 0.001419, "vq_loss_layer_030": 0.003006, "vq_loss_layer_031": 0.00415 }, { "ce_loss": 2.342651, "epoch": 0.0168, "grad_norm": 0.0011317930184304714, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07959, "key_mse_loss_layer_008": 0.09082, "key_mse_loss_layer_009": 0.096191, "key_mse_loss_layer_010": 0.109863, "key_mse_loss_layer_011": 0.105957, "key_mse_loss_layer_012": 0.080078, "key_mse_loss_layer_013": 0.136719, "key_mse_loss_layer_014": 0.130859, "key_mse_loss_layer_015": 0.121582, "key_mse_loss_layer_016": 0.11377, "key_mse_loss_layer_017": 0.11377, "key_mse_loss_layer_018": 0.118652, "key_mse_loss_layer_019": 0.09668, "key_mse_loss_layer_020": 0.111328, "key_mse_loss_layer_021": 0.105469, "key_mse_loss_layer_022": 0.109375, "key_mse_loss_layer_023": 0.105957, "key_mse_loss_layer_024": 0.08252, "key_mse_loss_layer_025": 0.078125, "key_mse_loss_layer_026": 0.092285, "key_mse_loss_layer_027": 0.088867, "key_mse_loss_layer_028": 0.097656, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049573, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.049976, "step": 16800, "value_mse_loss_layer_000": 0.000383, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.00766, "value_mse_loss_layer_005": 0.00705, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.013611, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.014832, "value_mse_loss_layer_019": 0.0177, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.021729, "value_mse_loss_layer_022": 0.02124, "value_mse_loss_layer_023": 0.023682, "value_mse_loss_layer_024": 0.026245, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.028809, "value_mse_loss_layer_027": 0.036621, "value_mse_loss_layer_028": 0.040771, "value_mse_loss_layer_029": 0.046631, "value_mse_loss_layer_030": 0.053467, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000171, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.00022, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.00021, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000164, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000246, "vq_loss_layer_021": 0.000395, "vq_loss_layer_022": 0.000275, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000423, "vq_loss_layer_026": 0.000469, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000843, "vq_loss_layer_029": 0.000801, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.00383 }, { "ce_loss": 2.323301, "epoch": 0.01681, "grad_norm": 0.0010167976142838597, "key_mse_loss_layer_000": 0.003616, "key_mse_loss_layer_001": 0.01123, "key_mse_loss_layer_002": 0.064941, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.10498, "key_mse_loss_layer_014": 0.104004, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.098633, "key_mse_loss_layer_024": 0.083496, "key_mse_loss_layer_025": 0.078613, "key_mse_loss_layer_026": 0.090332, "key_mse_loss_layer_027": 0.095215, "key_mse_loss_layer_028": 0.097168, "key_mse_loss_layer_029": 0.092285, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.04968, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050076, "step": 16810, "value_mse_loss_layer_000": 0.000359, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014404, "value_mse_loss_layer_010": 0.011475, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014343, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.016846, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.017212, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.021484, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.026001, "value_mse_loss_layer_023": 0.029419, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.036621, "value_mse_loss_layer_027": 0.04834, "value_mse_loss_layer_028": 0.052002, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.067871, "value_mse_loss_layer_031": 0.052979, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 9e-05, "vq_loss_layer_007": 0.000131, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000182, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.00016, "vq_loss_layer_012": 0.000278, "vq_loss_layer_013": 0.000242, "vq_loss_layer_014": 0.000307, "vq_loss_layer_015": 0.000332, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000232, "vq_loss_layer_018": 0.000278, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000152, "vq_loss_layer_021": 0.000239, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000213, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.001038, "vq_loss_layer_029": 0.001274, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.004486 }, { "ce_loss": 2.287226, "epoch": 0.01682, "grad_norm": 0.0011605409672483802, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.049973, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050391, "step": 16820, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.001129, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.014587, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.028076, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.042969, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.05542, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000177, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.00029, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000311, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000198, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000243, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000465, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.002884 }, { "ce_loss": 2.320431, "epoch": 0.01683, "grad_norm": 0.0009626311366446316, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.104004, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.120117, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.101562, "key_mse_loss_layer_017": 0.103027, "key_mse_loss_layer_018": 0.109375, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.049792, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.050177, "step": 16830, "value_mse_loss_layer_000": 0.000385, "value_mse_loss_layer_001": 0.001099, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.01416, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.02478, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000256, "vq_loss_layer_010": 0.000224, "vq_loss_layer_011": 0.000213, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000385, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000223, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000315, "vq_loss_layer_025": 0.000441, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000629, "vq_loss_layer_028": 0.000854, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.003525 }, { "ce_loss": 2.282744, "epoch": 0.01684, "grad_norm": 0.0010843176860362291, "key_mse_loss_layer_000": 0.003586, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049969, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.050357, "step": 16840, "value_mse_loss_layer_000": 0.000412, "value_mse_loss_layer_001": 0.001122, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.000294, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000965, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.002808 }, { "ce_loss": 2.281932, "epoch": 0.01685, "grad_norm": 0.0011459537781774998, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.049884, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050293, "step": 16850, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006561, "value_mse_loss_layer_005": 0.006104, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000203, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000205, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000378, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.000713, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.002487 }, { "ce_loss": 2.30283, "epoch": 0.01686, "grad_norm": 0.0010205532889813185, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.049802, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050217, "step": 16860, "value_mse_loss_layer_000": 0.00041, "value_mse_loss_layer_001": 0.001122, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000347, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000224, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000486, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.000862, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.29906, "epoch": 0.01687, "grad_norm": 0.0011573105584830046, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.051514, "key_mse_loss_layer_003": 0.044189, "key_mse_loss_layer_004": 0.039795, "key_mse_loss_layer_005": 0.052979, "key_mse_loss_layer_006": 0.060059, "key_mse_loss_layer_007": 0.069336, "key_mse_loss_layer_008": 0.078613, "key_mse_loss_layer_009": 0.08252, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.093262, "key_mse_loss_layer_012": 0.067871, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.089844, "key_mse_loss_layer_018": 0.094238, "key_mse_loss_layer_019": 0.080566, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.084961, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.072754, "key_mse_loss_layer_031": 0.057617, "kv_mse_loss": 0.04978, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050235, "step": 16870, "value_mse_loss_layer_000": 0.000401, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.014832, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 3.9e-05, "vq_loss_layer_005": 4.4e-05, "vq_loss_layer_006": 8.4e-05, "vq_loss_layer_007": 0.000129, "vq_loss_layer_008": 0.000138, "vq_loss_layer_009": 0.00018, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000294, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000134, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.000225, "vq_loss_layer_024": 0.000189, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.000942, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.003006 }, { "ce_loss": 2.268866, "epoch": 0.01688, "grad_norm": 0.0010132156312465668, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.050272, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050687, "step": 16880, "value_mse_loss_layer_000": 0.000385, "value_mse_loss_layer_001": 0.001083, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000343, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000183, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.00032, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.001778, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.321746, "epoch": 0.01689, "grad_norm": 0.001081637805327773, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049609, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050018, "step": 16890, "value_mse_loss_layer_000": 0.000401, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.019409, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.028076, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.00034, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000192, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000767, "vq_loss_layer_030": 0.001633, "vq_loss_layer_031": 0.002686 }, { "ce_loss": 2.310146, "epoch": 0.0169, "grad_norm": 0.001033324166201055, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.050079, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050516, "step": 16900, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.00036, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000469, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.001762, "vq_loss_layer_031": 0.003098 }, { "ce_loss": 2.333404, "epoch": 0.01691, "grad_norm": 0.0010580753441900015, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049887, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.050278, "step": 16910, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013489, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016235, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000463, "vq_loss_layer_018": 0.000213, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000205, "vq_loss_layer_023": 0.000288, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000423, "vq_loss_layer_028": 0.000584, "vq_loss_layer_029": 0.000717, "vq_loss_layer_030": 0.001671, "vq_loss_layer_031": 0.002426 }, { "ce_loss": 2.304761, "epoch": 0.01692, "grad_norm": 0.001186296227388084, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.060791, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.049939, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050378, "step": 16920, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006165, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000137, "vq_loss_layer_009": 0.000229, "vq_loss_layer_010": 0.000149, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000284, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000173, "vq_loss_layer_021": 0.000246, "vq_loss_layer_022": 0.000209, "vq_loss_layer_023": 0.000198, "vq_loss_layer_024": 0.00018, "vq_loss_layer_025": 0.000224, "vq_loss_layer_026": 0.000376, "vq_loss_layer_027": 0.000387, "vq_loss_layer_028": 0.000587, "vq_loss_layer_029": 0.000729, "vq_loss_layer_030": 0.001511, "vq_loss_layer_031": 0.002197 }, { "ce_loss": 2.28574, "epoch": 0.01693, "grad_norm": 0.0010647047311067581, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.050116, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050525, "step": 16930, "value_mse_loss_layer_000": 0.000404, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000224, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000219, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000374, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.000584, "vq_loss_layer_029": 0.000744, "vq_loss_layer_030": 0.001671, "vq_loss_layer_031": 0.002579 }, { "ce_loss": 2.291397, "epoch": 0.01694, "grad_norm": 0.0012661003274843097, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.041504, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.094727, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.139648, "key_mse_loss_layer_014": 0.132812, "key_mse_loss_layer_015": 0.120117, "key_mse_loss_layer_016": 0.114746, "key_mse_loss_layer_017": 0.114746, "key_mse_loss_layer_018": 0.12207, "key_mse_loss_layer_019": 0.095703, "key_mse_loss_layer_020": 0.108887, "key_mse_loss_layer_021": 0.103516, "key_mse_loss_layer_022": 0.109863, "key_mse_loss_layer_023": 0.105957, "key_mse_loss_layer_024": 0.083496, "key_mse_loss_layer_025": 0.07666, "key_mse_loss_layer_026": 0.091797, "key_mse_loss_layer_027": 0.087891, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.049887, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050281, "step": 16940, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.023682, "value_mse_loss_layer_024": 0.027344, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.02832, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.041016, "value_mse_loss_layer_029": 0.047363, "value_mse_loss_layer_030": 0.053223, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000139, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000315, "vq_loss_layer_025": 0.000443, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.000851, "vq_loss_layer_029": 0.000759, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.003647 }, { "ce_loss": 2.29021, "epoch": 0.01695, "grad_norm": 0.001068550394847989, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.049811, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050241, "step": 16950, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001114, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000167, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000214, "vq_loss_layer_025": 0.00024, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.001114, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.283392, "epoch": 0.01696, "grad_norm": 0.0010000838665291667, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.049881, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050299, "step": 16960, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.00021, "vq_loss_layer_024": 0.000227, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.000626, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.286603, "epoch": 0.01697, "grad_norm": 0.0010679666884243488, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.044434, "key_mse_loss_layer_004": 0.041504, "key_mse_loss_layer_005": 0.05542, "key_mse_loss_layer_006": 0.062012, "key_mse_loss_layer_007": 0.071289, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.128906, "key_mse_loss_layer_014": 0.125977, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.106934, "key_mse_loss_layer_017": 0.105957, "key_mse_loss_layer_018": 0.111816, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.103027, "key_mse_loss_layer_023": 0.100586, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.087402, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.094727, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049985, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.0504, "step": 16970, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.001083, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014221, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.014526, "value_mse_loss_layer_014": 0.01532, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.013489, "value_mse_loss_layer_017": 0.016724, "value_mse_loss_layer_018": 0.014832, "value_mse_loss_layer_019": 0.01709, "value_mse_loss_layer_020": 0.018921, "value_mse_loss_layer_021": 0.021729, "value_mse_loss_layer_022": 0.021362, "value_mse_loss_layer_023": 0.024536, "value_mse_loss_layer_024": 0.0271, "value_mse_loss_layer_025": 0.032471, "value_mse_loss_layer_026": 0.028687, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.048584, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 9e-05, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000239, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000254, "vq_loss_layer_018": 0.000163, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000334, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.000954, "vq_loss_layer_029": 0.00082, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.308332, "epoch": 0.01698, "grad_norm": 0.001073899446055293, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.049582, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.049988, "step": 16980, "value_mse_loss_layer_000": 0.000397, "value_mse_loss_layer_001": 0.001099, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000205, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.00034, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.299818, "epoch": 0.01699, "grad_norm": 0.001211589085869491, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050195, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.05062, "step": 16990, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000242, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000319, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000157, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000201, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000387, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000767, "vq_loss_layer_030": 0.001549, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.285968, "epoch": 0.017, "grad_norm": 0.0010066539980471134, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.060303, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.07959, "key_mse_loss_layer_025": 0.075684, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.050204, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.050583, "step": 17000, "value_mse_loss_layer_000": 0.000387, "value_mse_loss_layer_001": 0.001083, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000134, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000362, "vq_loss_layer_026": 0.000467, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000889, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.00386 }, { "ce_loss": 2.304563, "epoch": 0.01701, "grad_norm": 0.0009988361271098256, "key_mse_loss_layer_000": 0.003296, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049832, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050223, "step": 17010, "value_mse_loss_layer_000": 0.000404, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019897, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000177, "vq_loss_layer_010": 0.000157, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000298, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000326, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000189, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000211, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000227, "vq_loss_layer_026": 0.000374, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000599, "vq_loss_layer_029": 0.000816, "vq_loss_layer_030": 0.001762, "vq_loss_layer_031": 0.002365 }, { "ce_loss": 2.290173, "epoch": 0.01702, "grad_norm": 0.0011668975930660963, "key_mse_loss_layer_000": 0.003799, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.058838, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.089355, "key_mse_loss_layer_030": 0.099121, "key_mse_loss_layer_031": 0.089355, "kv_mse_loss": 0.049936, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.05033, "step": 17020, "value_mse_loss_layer_000": 0.000404, "value_mse_loss_layer_001": 0.001114, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.006744, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000292, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000713, "vq_loss_layer_028": 0.000843, "vq_loss_layer_029": 0.00161, "vq_loss_layer_030": 0.00267, "vq_loss_layer_031": 0.003555 }, { "ce_loss": 2.280095, "epoch": 0.01703, "grad_norm": 0.0011640924494713545, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.050317, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050717, "step": 17030, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001083, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014709, "value_mse_loss_layer_014": 0.015259, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000155, "vq_loss_layer_011": 0.00017, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000243, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000198, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000763, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.002762 }, { "ce_loss": 2.311381, "epoch": 0.01704, "grad_norm": 0.0011585344327613711, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049924, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050348, "step": 17040, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001099, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.000198, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000239, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000973, "vq_loss_layer_029": 0.000938, "vq_loss_layer_030": 0.00174, "vq_loss_layer_031": 0.002701 }, { "ce_loss": 2.301416, "epoch": 0.01705, "grad_norm": 0.0010703338775783777, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049649, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050043, "step": 17050, "value_mse_loss_layer_000": 0.000389, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000214, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000221, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000395, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000626, "vq_loss_layer_029": 0.000721, "vq_loss_layer_030": 0.00164, "vq_loss_layer_031": 0.002563 }, { "ce_loss": 2.318688, "epoch": 0.01706, "grad_norm": 0.0011815045727416873, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.049722, "kv_vq_loss": 0.000382, "learning_rate": 0.001, "loss": 0.050113, "step": 17060, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.049072, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000242, "vq_loss_layer_023": 0.000303, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000444, "vq_loss_layer_027": 0.000404, "vq_loss_layer_028": 0.000645, "vq_loss_layer_029": 0.000675, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.002808 }, { "ce_loss": 2.291262, "epoch": 0.01707, "grad_norm": 0.0011809758143499494, "key_mse_loss_layer_000": 0.003876, "key_mse_loss_layer_001": 0.011536, "key_mse_loss_layer_002": 0.061768, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.077148, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.093262, "key_mse_loss_layer_028": 0.095703, "key_mse_loss_layer_029": 0.093262, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.050128, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050555, "step": 17070, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.001129, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014343, "value_mse_loss_layer_010": 0.011414, "value_mse_loss_layer_011": 0.012268, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014648, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.033203, "value_mse_loss_layer_025": 0.038818, "value_mse_loss_layer_026": 0.036865, "value_mse_loss_layer_027": 0.047852, "value_mse_loss_layer_028": 0.054199, "value_mse_loss_layer_029": 0.061523, "value_mse_loss_layer_030": 0.068848, "value_mse_loss_layer_031": 0.053955, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 8.9e-05, "vq_loss_layer_007": 0.000134, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000183, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000267, "vq_loss_layer_018": 0.000224, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000166, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000205, "vq_loss_layer_024": 0.000242, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000622, "vq_loss_layer_028": 0.00116, "vq_loss_layer_029": 0.001465, "vq_loss_layer_030": 0.002548, "vq_loss_layer_031": 0.004211 }, { "ce_loss": 2.352803, "epoch": 0.01708, "grad_norm": 0.0009961280738934875, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.071777, "key_mse_loss_layer_028": 0.07959, "key_mse_loss_layer_029": 0.075684, "key_mse_loss_layer_030": 0.07666, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.049152, "kv_vq_loss": 0.000372, "learning_rate": 0.001, "loss": 0.049509, "step": 17080, "value_mse_loss_layer_000": 0.000389, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012817, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016846, "value_mse_loss_layer_014": 0.017212, "value_mse_loss_layer_015": 0.019653, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.02002, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.036621, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.054199, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000422, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000372, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000189, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000357, "vq_loss_layer_026": 0.000526, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.001778, "vq_loss_layer_031": 0.002563 }, { "ce_loss": 2.328387, "epoch": 0.01709, "grad_norm": 0.0012853625230491161, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.049814, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050232, "step": 17090, "value_mse_loss_layer_000": 0.000404, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011353, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.000374, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.299267, "epoch": 0.0171, "grad_norm": 0.0011580931022763252, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.078125, "kv_mse_loss": 0.049771, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050183, "step": 17100, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027954, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000164, "vq_loss_layer_021": 0.000259, "vq_loss_layer_022": 0.000204, "vq_loss_layer_023": 0.000223, "vq_loss_layer_024": 0.000219, "vq_loss_layer_025": 0.000244, "vq_loss_layer_026": 0.000372, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.00061, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.002701 }, { "ce_loss": 2.318729, "epoch": 0.01711, "grad_norm": 0.0011590811191126704, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.049747, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050143, "step": 17110, "value_mse_loss_layer_000": 0.000414, "value_mse_loss_layer_001": 0.001114, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000167, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000204, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000202, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.001869, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.295577, "epoch": 0.01712, "grad_norm": 0.0010802868055179715, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050269, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050681, "step": 17120, "value_mse_loss_layer_000": 0.000389, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.01355, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.016479, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.019287, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000359, "vq_loss_layer_015": 0.000423, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000173, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000345, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.00174, "vq_loss_layer_031": 0.00267 }, { "ce_loss": 2.296933, "epoch": 0.01713, "grad_norm": 0.001244750339537859, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.04968, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050098, "step": 17130, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001083, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.006744, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000172, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000307, "vq_loss_layer_015": 0.000336, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000202, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000207, "vq_loss_layer_025": 0.000237, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000595, "vq_loss_layer_029": 0.000717, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.002411 }, { "ce_loss": 2.337911, "epoch": 0.01714, "grad_norm": 0.0010114316828548908, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.093262, "key_mse_loss_layer_031": 0.083496, "kv_mse_loss": 0.04968, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.050082, "step": 17140, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.001083, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006622, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000311, "vq_loss_layer_016": 0.000282, "vq_loss_layer_017": 0.000261, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.000275, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.001198, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003067 }, { "ce_loss": 2.320663, "epoch": 0.01715, "grad_norm": 0.0011043624253943563, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049777, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050162, "step": 17150, "value_mse_loss_layer_000": 0.000385, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019165, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.021851, "value_mse_loss_layer_023": 0.024292, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.028931, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.049805, "value_mse_loss_layer_030": 0.054199, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000313, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000232, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000874, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.001762, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.331443, "epoch": 0.01716, "grad_norm": 0.0011033234186470509, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.059082, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049603, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.049991, "step": 17160, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007446, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000252, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000223, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000309, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.000938, "vq_loss_layer_029": 0.000977, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.003571 }, { "ce_loss": 2.334209, "epoch": 0.01717, "grad_norm": 0.001205272739753127, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049872, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.050272, "step": 17170, "value_mse_loss_layer_000": 0.000397, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.049805, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000229, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.000748, "vq_loss_layer_030": 0.001831, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.299388, "epoch": 0.01718, "grad_norm": 0.001102935872040689, "key_mse_loss_layer_000": 0.003693, "key_mse_loss_layer_001": 0.010925, "key_mse_loss_layer_002": 0.059814, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.044922, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049701, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050098, "step": 17180, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.029175, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.050049, "value_mse_loss_layer_029": 0.058594, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.05249, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000134, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000176, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000269, "vq_loss_layer_018": 0.000241, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000147, "vq_loss_layer_021": 0.000238, "vq_loss_layer_022": 0.000184, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000484, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.001167, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.004028 }, { "ce_loss": 2.335106, "epoch": 0.01719, "grad_norm": 0.0012394924415275455, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.043945, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.049777, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.050192, "step": 17190, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007446, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.025024, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.042236, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.000141, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000238, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.00036, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000584, "vq_loss_layer_028": 0.000854, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.003571 }, { "ce_loss": 2.282595, "epoch": 0.0172, "grad_norm": 0.0012038357090204954, "key_mse_loss_layer_000": 0.004089, "key_mse_loss_layer_001": 0.010864, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.071777, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.081055, "key_mse_loss_layer_020": 0.086914, "key_mse_loss_layer_021": 0.084961, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.061768, "kv_mse_loss": 0.050226, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050644, "step": 17200, "value_mse_loss_layer_000": 0.000423, "value_mse_loss_layer_001": 0.001144, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.016968, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019165, "value_mse_loss_layer_021": 0.02124, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.025391, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000155, "vq_loss_layer_021": 0.000275, "vq_loss_layer_022": 0.000192, "vq_loss_layer_023": 0.000204, "vq_loss_layer_024": 0.000317, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.000828, "vq_loss_layer_029": 0.00106, "vq_loss_layer_030": 0.002319, "vq_loss_layer_031": 0.004395 }, { "ce_loss": 2.329132, "epoch": 0.01721, "grad_norm": 0.0011144292075186968, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049512, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.049902, "step": 17210, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007263, "value_mse_loss_layer_004": 0.006592, "value_mse_loss_layer_005": 0.006165, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000136, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000162, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.0002, "vq_loss_layer_025": 0.00023, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000372, "vq_loss_layer_028": 0.00069, "vq_loss_layer_029": 0.000633, "vq_loss_layer_030": 0.001762, "vq_loss_layer_031": 0.002243 }, { "ce_loss": 2.303188, "epoch": 0.01722, "grad_norm": 0.0012237102491781116, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.053711, "key_mse_loss_layer_004": 0.059326, "key_mse_loss_layer_005": 0.064941, "key_mse_loss_layer_006": 0.072266, "key_mse_loss_layer_007": 0.080566, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.077148, "kv_mse_loss": 0.049881, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050296, "step": 17220, "value_mse_loss_layer_000": 0.000401, "value_mse_loss_layer_001": 0.001099, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.021606, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.025024, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000215, "vq_loss_layer_024": 0.000199, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000668, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.002579 }, { "ce_loss": 2.307076, "epoch": 0.01723, "grad_norm": 0.00103294407017529, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.093262, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.049661, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050058, "step": 17230, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.01709, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000241, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.00032, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000246, "vq_loss_layer_018": 0.000159, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000162, "vq_loss_layer_021": 0.000278, "vq_loss_layer_022": 0.000203, "vq_loss_layer_023": 0.000236, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.00074, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.267873, "epoch": 0.01724, "grad_norm": 0.0011279728496447206, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049515, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.049911, "step": 17240, "value_mse_loss_layer_000": 0.000383, "value_mse_loss_layer_001": 0.001099, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000136, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000212, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000322, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.001778, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.320218, "epoch": 0.01725, "grad_norm": 0.0012064115144312382, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.075195, "key_mse_loss_layer_031": 0.060791, "kv_mse_loss": 0.04986, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050293, "step": 17250, "value_mse_loss_layer_000": 0.000418, "value_mse_loss_layer_001": 0.001114, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007446, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.049805, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000205, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000385, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000475, "vq_loss_layer_028": 0.000843, "vq_loss_layer_029": 0.000942, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.003616 }, { "ce_loss": 2.337324, "epoch": 0.01726, "grad_norm": 0.0010246969759464264, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.061035, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.049557, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.049945, "step": 17260, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006622, "value_mse_loss_layer_005": 0.006104, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000132, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000151, "vq_loss_layer_011": 0.000178, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000282, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.00014, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000271, "vq_loss_layer_022": 0.000193, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000191, "vq_loss_layer_025": 0.000226, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.00038, "vq_loss_layer_028": 0.000568, "vq_loss_layer_029": 0.000656, "vq_loss_layer_030": 0.001511, "vq_loss_layer_031": 0.002243 }, { "ce_loss": 2.286681, "epoch": 0.01727, "grad_norm": 0.0011728755198419094, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.01178, "key_mse_loss_layer_002": 0.064941, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.091309, "key_mse_loss_layer_009": 0.095215, "key_mse_loss_layer_010": 0.107422, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.129883, "key_mse_loss_layer_014": 0.126953, "key_mse_loss_layer_015": 0.114746, "key_mse_loss_layer_016": 0.113281, "key_mse_loss_layer_017": 0.107422, "key_mse_loss_layer_018": 0.120605, "key_mse_loss_layer_019": 0.099609, "key_mse_loss_layer_020": 0.111816, "key_mse_loss_layer_021": 0.103516, "key_mse_loss_layer_022": 0.114746, "key_mse_loss_layer_023": 0.116699, "key_mse_loss_layer_024": 0.09668, "key_mse_loss_layer_025": 0.091797, "key_mse_loss_layer_026": 0.108398, "key_mse_loss_layer_027": 0.117188, "key_mse_loss_layer_028": 0.118164, "key_mse_loss_layer_029": 0.115723, "key_mse_loss_layer_030": 0.126953, "key_mse_loss_layer_031": 0.099121, "kv_mse_loss": 0.049887, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050314, "step": 17270, "value_mse_loss_layer_000": 0.00036, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010437, "value_mse_loss_layer_009": 0.013367, "value_mse_loss_layer_010": 0.011353, "value_mse_loss_layer_011": 0.011719, "value_mse_loss_layer_012": 0.012085, "value_mse_loss_layer_013": 0.013123, "value_mse_loss_layer_014": 0.014038, "value_mse_loss_layer_015": 0.014404, "value_mse_loss_layer_016": 0.01239, "value_mse_loss_layer_017": 0.016357, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.046387, "value_mse_loss_layer_028": 0.049316, "value_mse_loss_layer_029": 0.058838, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.057129, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.1e-05, "vq_loss_layer_007": 0.000125, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000162, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.000265, "vq_loss_layer_013": 0.000203, "vq_loss_layer_014": 0.000299, "vq_loss_layer_015": 0.000273, "vq_loss_layer_016": 0.000277, "vq_loss_layer_017": 0.00024, "vq_loss_layer_018": 0.000259, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000167, "vq_loss_layer_021": 0.000248, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000404, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000641, "vq_loss_layer_028": 0.001289, "vq_loss_layer_029": 0.001793, "vq_loss_layer_030": 0.003601, "vq_loss_layer_031": 0.005585 }, { "ce_loss": 2.324681, "epoch": 0.01728, "grad_norm": 0.0009576125303283334, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049844, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050244, "step": 17280, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001083, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000211, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000199, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000328, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.001762, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.305515, "epoch": 0.01729, "grad_norm": 0.0012045929906889796, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.056152, "key_mse_loss_layer_006": 0.0625, "key_mse_loss_layer_007": 0.070801, "key_mse_loss_layer_008": 0.079102, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.074219, "key_mse_loss_layer_031": 0.057617, "kv_mse_loss": 0.04985, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050247, "step": 17290, "value_mse_loss_layer_000": 0.000404, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004242, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014587, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000136, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000201, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000465, "vq_loss_layer_016": 0.000366, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000211, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.00025, "vq_loss_layer_021": 0.000463, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000399, "vq_loss_layer_025": 0.000414, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.00066, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.316762, "epoch": 0.0173, "grad_norm": 0.0010664042783901095, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049677, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.050061, "step": 17300, "value_mse_loss_layer_000": 0.000397, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000214, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000338, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000163, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000221, "vq_loss_layer_025": 0.00025, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000437, "vq_loss_layer_028": 0.000679, "vq_loss_layer_029": 0.000767, "vq_loss_layer_030": 0.001556, "vq_loss_layer_031": 0.002609 }, { "ce_loss": 2.298441, "epoch": 0.01731, "grad_norm": 0.0012338816886767745, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049347, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049762, "step": 17310, "value_mse_loss_layer_000": 0.000385, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000172, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000241, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000338, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000439, "vq_loss_layer_028": 0.00071, "vq_loss_layer_029": 0.000767, "vq_loss_layer_030": 0.001617, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.275563, "epoch": 0.01732, "grad_norm": 0.0012419362319633365, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.04967, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050092, "step": 17320, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000221, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000418, "vq_loss_layer_028": 0.000599, "vq_loss_layer_029": 0.00069, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.002289 }, { "ce_loss": 2.307844, "epoch": 0.01733, "grad_norm": 0.001202480518259108, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049689, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050076, "step": 17330, "value_mse_loss_layer_000": 0.000389, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.027832, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000292, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000311, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000286, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.000257, "vq_loss_layer_022": 0.000199, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000185, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.00037, "vq_loss_layer_027": 0.00042, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.002548 }, { "ce_loss": 2.344426, "epoch": 0.01734, "grad_norm": 0.0010656930971890688, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.049594, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.049982, "step": 17340, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000135, "vq_loss_layer_009": 0.000183, "vq_loss_layer_010": 0.000155, "vq_loss_layer_011": 0.000174, "vq_loss_layer_012": 0.000294, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000328, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.000134, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000202, "vq_loss_layer_023": 0.00022, "vq_loss_layer_024": 0.000196, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.00238, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.269508, "epoch": 0.01735, "grad_norm": 0.0012432936346158385, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049936, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050357, "step": 17350, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.00024, "vq_loss_layer_023": 0.000219, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.000257, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000828, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.294218, "epoch": 0.01736, "grad_norm": 0.001031666761264205, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.04975, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050168, "step": 17360, "value_mse_loss_layer_000": 0.000387, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000418, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000164, "vq_loss_layer_021": 0.000273, "vq_loss_layer_022": 0.000197, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.00042, "vq_loss_layer_028": 0.00069, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.327709, "epoch": 0.01737, "grad_norm": 0.0011310286354273558, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.077148, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.049738, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.05014, "step": 17370, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000399, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.001732, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.356215, "epoch": 0.01738, "grad_norm": 0.0012608567485585809, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.09082, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.049164, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.049582, "step": 17380, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 6.1e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000136, "vq_loss_layer_009": 0.000171, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000173, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000155, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.000954, "vq_loss_layer_029": 0.00145, "vq_loss_layer_030": 0.002579, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.296564, "epoch": 0.01739, "grad_norm": 0.0009513821569271386, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049469, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049866, "step": 17390, "value_mse_loss_layer_000": 0.000389, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006622, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013245, "value_mse_loss_layer_013": 0.014771, "value_mse_loss_layer_014": 0.015442, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000131, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000151, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000292, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000328, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000248, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000267, "vq_loss_layer_022": 0.000209, "vq_loss_layer_023": 0.000209, "vq_loss_layer_024": 0.000201, "vq_loss_layer_025": 0.000233, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.002335 }, { "ce_loss": 2.306928, "epoch": 0.0174, "grad_norm": 0.001203213701955974, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049438, "kv_vq_loss": 0.000381, "learning_rate": 0.001, "loss": 0.049823, "step": 17400, "value_mse_loss_layer_000": 0.000385, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000225, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000296, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.001686, "vq_loss_layer_031": 0.002838 }, { "ce_loss": 2.266984, "epoch": 0.01741, "grad_norm": 0.0011020554229617119, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049921, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050339, "step": 17410, "value_mse_loss_layer_000": 0.000385, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.001823, "vq_loss_layer_031": 0.002838 }, { "ce_loss": 2.332573, "epoch": 0.01742, "grad_norm": 0.0011656059650704265, "key_mse_loss_layer_000": 0.005066, "key_mse_loss_layer_001": 0.011475, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.049854, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050275, "step": 17420, "value_mse_loss_layer_000": 0.00042, "value_mse_loss_layer_001": 0.001106, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000205, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000538, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.001045, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.003006 }, { "ce_loss": 2.336303, "epoch": 0.01743, "grad_norm": 0.001082407427020371, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049713, "kv_vq_loss": 0.00038, "learning_rate": 0.001, "loss": 0.050116, "step": 17430, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.00033, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000725, "vq_loss_layer_030": 0.001602, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.295059, "epoch": 0.01744, "grad_norm": 0.0010495503665879369, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.049655, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.050043, "step": 17440, "value_mse_loss_layer_000": 0.000383, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007172, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000144, "vq_loss_layer_009": 0.000189, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000163, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000224, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000412, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.000679, "vq_loss_layer_030": 0.001595, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.226881, "epoch": 0.01745, "grad_norm": 0.001097117899917066, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050012, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050415, "step": 17450, "value_mse_loss_layer_000": 0.000397, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.01416, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000269, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000165, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000229, "vq_loss_layer_024": 0.000232, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.000381, "vq_loss_layer_027": 0.000399, "vq_loss_layer_028": 0.000614, "vq_loss_layer_029": 0.000706, "vq_loss_layer_030": 0.001701, "vq_loss_layer_031": 0.002579 }, { "ce_loss": 2.301045, "epoch": 0.01746, "grad_norm": 0.0010617467341944575, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049448, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.04986, "step": 17460, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.024048, "value_mse_loss_layer_022": 0.025635, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.043457, "value_mse_loss_layer_028": 0.048828, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000219, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000214, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000607, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.300778, "epoch": 0.01747, "grad_norm": 0.0013354226248338819, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.050385, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.050806, "step": 17470, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.055664, "value_mse_loss_layer_030": 0.062988, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000142, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000336, "vq_loss_layer_016": 0.000286, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.00018, "vq_loss_layer_025": 0.000231, "vq_loss_layer_026": 0.000395, "vq_loss_layer_027": 0.000399, "vq_loss_layer_028": 0.000626, "vq_loss_layer_029": 0.000732, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.002472 }, { "ce_loss": 2.296472, "epoch": 0.01748, "grad_norm": 0.0011753900907933712, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.049969, "kv_vq_loss": 0.000408, "learning_rate": 0.001, "loss": 0.050394, "step": 17480, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000135, "vq_loss_layer_009": 0.000183, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000158, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000282, "vq_loss_layer_022": 0.000202, "vq_loss_layer_023": 0.000211, "vq_loss_layer_024": 0.000189, "vq_loss_layer_025": 0.000237, "vq_loss_layer_026": 0.000387, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.000736, "vq_loss_layer_030": 0.001617, "vq_loss_layer_031": 0.002533 }, { "ce_loss": 2.310874, "epoch": 0.01749, "grad_norm": 0.0011381349759176373, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049686, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050073, "step": 17490, "value_mse_loss_layer_000": 0.000387, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013245, "value_mse_loss_layer_013": 0.014648, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.01709, "value_mse_loss_layer_016": 0.013855, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028076, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000315, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000261, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000205, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000801, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.316886, "epoch": 0.0175, "grad_norm": 0.00115735805593431, "key_mse_loss_layer_000": 0.002777, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049371, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.049796, "step": 17500, "value_mse_loss_layer_000": 0.000378, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.00708, "value_mse_loss_layer_004": 0.00647, "value_mse_loss_layer_005": 0.006165, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008423, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.029419, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.048828, "value_mse_loss_layer_030": 0.053223, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.000136, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000157, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000167, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000204, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000209, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000374, "vq_loss_layer_027": 0.000378, "vq_loss_layer_028": 0.000633, "vq_loss_layer_029": 0.00066, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.00235 }, { "ce_loss": 2.343917, "epoch": 0.01751, "grad_norm": 0.0011489434400573373, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.049554, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049942, "step": 17510, "value_mse_loss_layer_000": 0.000389, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.052246, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 3.9e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000725, "vq_loss_layer_029": 0.000801, "vq_loss_layer_030": 0.001442, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.298429, "epoch": 0.01752, "grad_norm": 0.001193939708173275, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.043457, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.066406, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.072754, "key_mse_loss_layer_031": 0.060791, "kv_mse_loss": 0.049628, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.05004, "step": 17520, "value_mse_loss_layer_000": 0.000389, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028076, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000141, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000284, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000197, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000207, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000414, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.001617, "vq_loss_layer_031": 0.003326 }, { "ce_loss": 2.299664, "epoch": 0.01753, "grad_norm": 0.001169932191260159, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049954, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050348, "step": 17530, "value_mse_loss_layer_000": 0.000387, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008423, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.011719, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013306, "value_mse_loss_layer_013": 0.014771, "value_mse_loss_layer_014": 0.015259, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 9.1e-05, "vq_loss_layer_007": 0.000138, "vq_loss_layer_008": 0.000134, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000137, "vq_loss_layer_011": 0.000167, "vq_loss_layer_012": 0.00028, "vq_loss_layer_013": 0.000232, "vq_loss_layer_014": 0.000286, "vq_loss_layer_015": 0.000336, "vq_loss_layer_016": 0.000277, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000164, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.0002, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000239, "vq_loss_layer_026": 0.000383, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.001656, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.25214, "epoch": 0.01754, "grad_norm": 0.0009844520827755332, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049915, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.050299, "step": 17540, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 4.8e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000141, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000173, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000248, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.000164, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000165, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.00024, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.001572, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.31894, "epoch": 0.01755, "grad_norm": 0.0012225891696289182, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.090332, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.07959, "kv_mse_loss": 0.049792, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050217, "step": 17550, "value_mse_loss_layer_000": 0.000389, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.045166, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.056152, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.00017, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000234, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.000204, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000687, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000977, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.00119, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.003387 }, { "ce_loss": 2.318104, "epoch": 0.01756, "grad_norm": 0.0010484177619218826, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.044434, "key_mse_loss_layer_004": 0.042236, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.091309, "key_mse_loss_layer_009": 0.098145, "key_mse_loss_layer_010": 0.109375, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.079102, "key_mse_loss_layer_013": 0.140625, "key_mse_loss_layer_014": 0.138672, "key_mse_loss_layer_015": 0.123047, "key_mse_loss_layer_016": 0.117676, "key_mse_loss_layer_017": 0.118652, "key_mse_loss_layer_018": 0.124512, "key_mse_loss_layer_019": 0.097168, "key_mse_loss_layer_020": 0.115723, "key_mse_loss_layer_021": 0.10791, "key_mse_loss_layer_022": 0.113281, "key_mse_loss_layer_023": 0.108887, "key_mse_loss_layer_024": 0.085449, "key_mse_loss_layer_025": 0.081055, "key_mse_loss_layer_026": 0.095703, "key_mse_loss_layer_027": 0.091309, "key_mse_loss_layer_028": 0.100098, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049704, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050089, "step": 17560, "value_mse_loss_layer_000": 0.000408, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.013306, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.014832, "value_mse_loss_layer_019": 0.01709, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.021606, "value_mse_loss_layer_023": 0.023193, "value_mse_loss_layer_024": 0.025513, "value_mse_loss_layer_025": 0.03125, "value_mse_loss_layer_026": 0.027954, "value_mse_loss_layer_027": 0.0354, "value_mse_loss_layer_028": 0.039062, "value_mse_loss_layer_029": 0.046143, "value_mse_loss_layer_030": 0.050537, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.000225, "vq_loss_layer_011": 0.000226, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000446, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000257, "vq_loss_layer_021": 0.000435, "vq_loss_layer_022": 0.000381, "vq_loss_layer_023": 0.000404, "vq_loss_layer_024": 0.000412, "vq_loss_layer_025": 0.000633, "vq_loss_layer_026": 0.000641, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.000946, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.003693 }, { "ce_loss": 2.254644, "epoch": 0.01757, "grad_norm": 0.0011536668753251433, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049524, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.049908, "step": 17570, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000231, "vq_loss_layer_024": 0.000227, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000465, "vq_loss_layer_028": 0.00069, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.002533 }, { "ce_loss": 2.324019, "epoch": 0.01758, "grad_norm": 0.0010555572807788849, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049582, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049969, "step": 17580, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.00033, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000431, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.00069, "vq_loss_layer_029": 0.000847, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.00267 }, { "ce_loss": 2.322056, "epoch": 0.01759, "grad_norm": 0.0011152282822877169, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.049506, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.049905, "step": 17590, "value_mse_loss_layer_000": 0.000383, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.025391, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.049805, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000359, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000378, "vq_loss_layer_022": 0.000267, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000299, "vq_loss_layer_025": 0.000351, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.00082, "vq_loss_layer_029": 0.000946, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.347556, "epoch": 0.0176, "grad_norm": 0.0010973084717988968, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.047607, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049124, "kv_vq_loss": 0.000377, "learning_rate": 0.001, "loss": 0.0495, "step": 17600, "value_mse_loss_layer_000": 0.000397, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012207, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015198, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000138, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000163, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000391, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000839, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.347399, "epoch": 0.01761, "grad_norm": 0.0010596609208732843, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.05, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050406, "step": 17610, "value_mse_loss_layer_000": 0.000401, "value_mse_loss_layer_001": 0.001091, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014771, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000298, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000153, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000204, "vq_loss_layer_023": 0.000227, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.340131, "epoch": 0.01762, "grad_norm": 0.0011252593249082565, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.083496, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.093262, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.105957, "key_mse_loss_layer_014": 0.103027, "key_mse_loss_layer_015": 0.092285, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049438, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.049841, "step": 17620, "value_mse_loss_layer_000": 0.000387, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011719, "value_mse_loss_layer_011": 0.012146, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.02478, "value_mse_loss_layer_023": 0.028198, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.047852, "value_mse_loss_layer_029": 0.055176, "value_mse_loss_layer_030": 0.060303, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000134, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000176, "vq_loss_layer_010": 0.000146, "vq_loss_layer_011": 0.000161, "vq_loss_layer_012": 0.000282, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000288, "vq_loss_layer_015": 0.000322, "vq_loss_layer_016": 0.00028, "vq_loss_layer_017": 0.000269, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000157, "vq_loss_layer_021": 0.000265, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000212, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.001678, "vq_loss_layer_031": 0.003082 }, { "ce_loss": 2.285215, "epoch": 0.01763, "grad_norm": 0.0010972132440656424, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049945, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050348, "step": 17630, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006744, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000129, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000153, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.000309, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000267, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000275, "vq_loss_layer_022": 0.000179, "vq_loss_layer_023": 0.000236, "vq_loss_layer_024": 0.000202, "vq_loss_layer_025": 0.000257, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000423, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.00074, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.002426 }, { "ce_loss": 2.338817, "epoch": 0.01764, "grad_norm": 0.0010333077516406775, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.059082, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.104004, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.123047, "key_mse_loss_layer_015": 0.112793, "key_mse_loss_layer_016": 0.104492, "key_mse_loss_layer_017": 0.106934, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.106445, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.101562, "key_mse_loss_layer_023": 0.099609, "key_mse_loss_layer_024": 0.078125, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049734, "kv_vq_loss": 0.00038, "learning_rate": 0.001, "loss": 0.050107, "step": 17640, "value_mse_loss_layer_000": 0.000366, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015015, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022095, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000359, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000222, "vq_loss_layer_021": 0.000347, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.000307, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000427, "vq_loss_layer_026": 0.000629, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.001015, "vq_loss_layer_029": 0.001411, "vq_loss_layer_030": 0.002686, "vq_loss_layer_031": 0.003677 }, { "ce_loss": 2.292864, "epoch": 0.01765, "grad_norm": 0.00129161705262959, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.050067, "kv_vq_loss": 0.000414, "learning_rate": 0.001, "loss": 0.050504, "step": 17650, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015747, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.02124, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.031494, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.041748, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000243, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000328, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000378, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000223, "vq_loss_layer_020": 0.000231, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000713, "vq_loss_layer_029": 0.000977, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.003204 }, { "ce_loss": 2.327968, "epoch": 0.01766, "grad_norm": 0.001038610003888607, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.04964, "kv_vq_loss": 0.000409, "learning_rate": 0.001, "loss": 0.050095, "step": 17660, "value_mse_loss_layer_000": 0.000389, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.05957, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.9e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000201, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.00024, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.001892, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.326158, "epoch": 0.01767, "grad_norm": 0.0010312108788639307, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049332, "kv_vq_loss": 0.000381, "learning_rate": 0.001, "loss": 0.04971, "step": 17670, "value_mse_loss_layer_000": 0.000385, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007172, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006195, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000144, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000236, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.001869, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.31782, "epoch": 0.01768, "grad_norm": 0.0012515862472355366, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.043945, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.075684, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.049878, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050272, "step": 17680, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.01355, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.014832, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.02417, "value_mse_loss_layer_024": 0.026489, "value_mse_loss_layer_025": 0.032471, "value_mse_loss_layer_026": 0.027954, "value_mse_loss_layer_027": 0.035889, "value_mse_loss_layer_028": 0.04126, "value_mse_loss_layer_029": 0.046143, "value_mse_loss_layer_030": 0.052002, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000241, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000334, "vq_loss_layer_016": 0.000277, "vq_loss_layer_017": 0.000267, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.00028, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000383, "vq_loss_layer_026": 0.000486, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.00082, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.314304, "epoch": 0.01769, "grad_norm": 0.001211165334098041, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.049561, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.049997, "step": 17690, "value_mse_loss_layer_000": 0.000366, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.01709, "value_mse_loss_layer_016": 0.013855, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.014648, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.021362, "value_mse_loss_layer_022": 0.021484, "value_mse_loss_layer_023": 0.024292, "value_mse_loss_layer_024": 0.028076, "value_mse_loss_layer_025": 0.031982, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.036865, "value_mse_loss_layer_028": 0.041992, "value_mse_loss_layer_029": 0.047852, "value_mse_loss_layer_030": 0.053223, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000141, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000157, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000332, "vq_loss_layer_025": 0.000381, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000992, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.282921, "epoch": 0.0177, "grad_norm": 0.0010704585583880544, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049463, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049854, "step": 17700, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000298, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000694, "vq_loss_layer_030": 0.001579, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.33396, "epoch": 0.01771, "grad_norm": 0.00107059464789927, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049438, "kv_vq_loss": 0.000382, "learning_rate": 0.001, "loss": 0.049826, "step": 17710, "value_mse_loss_layer_000": 0.000383, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006683, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 4.7e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000191, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000154, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.000218, "vq_loss_layer_024": 0.0002, "vq_loss_layer_025": 0.000244, "vq_loss_layer_026": 0.000368, "vq_loss_layer_027": 0.000422, "vq_loss_layer_028": 0.00061, "vq_loss_layer_029": 0.000706, "vq_loss_layer_030": 0.001472, "vq_loss_layer_031": 0.002472 }, { "ce_loss": 2.360704, "epoch": 0.01772, "grad_norm": 0.0011173248058184981, "key_mse_loss_layer_000": 0.002853, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.100098, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049969, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050378, "step": 17720, "value_mse_loss_layer_000": 0.000374, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.014587, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.016724, "value_mse_loss_layer_016": 0.013489, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.015198, "value_mse_loss_layer_019": 0.017334, "value_mse_loss_layer_020": 0.019043, "value_mse_loss_layer_021": 0.021729, "value_mse_loss_layer_022": 0.022217, "value_mse_loss_layer_023": 0.024414, "value_mse_loss_layer_024": 0.0271, "value_mse_loss_layer_025": 0.032471, "value_mse_loss_layer_026": 0.029419, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.04834, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000137, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000248, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000334, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000292, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000338, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000935, "vq_loss_layer_029": 0.000866, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003738 }, { "ce_loss": 2.263377, "epoch": 0.01773, "grad_norm": 0.0011811803560703993, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.04971, "kv_vq_loss": 0.000399, "learning_rate": 0.001, "loss": 0.050122, "step": 17730, "value_mse_loss_layer_000": 0.000397, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013306, "value_mse_loss_layer_013": 0.014648, "value_mse_loss_layer_014": 0.015442, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.016846, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000139, "vq_loss_layer_009": 0.000189, "vq_loss_layer_010": 0.000148, "vq_loss_layer_011": 0.000174, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000233, "vq_loss_layer_014": 0.000292, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000277, "vq_loss_layer_017": 0.000265, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000154, "vq_loss_layer_021": 0.000263, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000203, "vq_loss_layer_024": 0.000192, "vq_loss_layer_025": 0.000224, "vq_loss_layer_026": 0.000376, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.00148, "vq_loss_layer_031": 0.002502 }, { "ce_loss": 2.334868, "epoch": 0.01774, "grad_norm": 0.0012846726458519697, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049374, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.049762, "step": 17740, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006561, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.01416, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 2e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000176, "vq_loss_layer_012": 0.000345, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000326, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000154, "vq_loss_layer_021": 0.000256, "vq_loss_layer_022": 0.000192, "vq_loss_layer_023": 0.000221, "vq_loss_layer_024": 0.000214, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000381, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000847, "vq_loss_layer_030": 0.002594, "vq_loss_layer_031": 0.002472 }, { "ce_loss": 2.276437, "epoch": 0.01775, "grad_norm": 0.0010556826600804925, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049683, "kv_vq_loss": 0.000406, "learning_rate": 0.001, "loss": 0.05011, "step": 17750, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007202, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006287, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.000141, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000214, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000196, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.00061, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.002609 }, { "ce_loss": 2.376763, "epoch": 0.01776, "grad_norm": 0.001121237175539136, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.049493, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.049884, "step": 17760, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001083, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012268, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.061279, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000138, "vq_loss_layer_009": 0.000172, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000172, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000235, "vq_loss_layer_014": 0.000305, "vq_loss_layer_015": 0.000309, "vq_loss_layer_016": 0.000284, "vq_loss_layer_017": 0.000265, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000131, "vq_loss_layer_020": 0.000151, "vq_loss_layer_021": 0.000237, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000196, "vq_loss_layer_024": 0.000219, "vq_loss_layer_025": 0.000235, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000584, "vq_loss_layer_029": 0.000866, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.002609 }, { "ce_loss": 2.323505, "epoch": 0.01777, "grad_norm": 0.0011275687720626593, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049515, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.049908, "step": 17770, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006622, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000157, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000167, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000164, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000191, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.000908, "vq_loss_layer_029": 0.000725, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.31314, "epoch": 0.01778, "grad_norm": 0.001050839084200561, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049695, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.050089, "step": 17780, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007263, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006195, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000175, "vq_loss_layer_010": 0.000155, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000301, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000161, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000167, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000195, "vq_loss_layer_023": 0.000217, "vq_loss_layer_024": 0.000194, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000385, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.002548 }, { "ce_loss": 2.282524, "epoch": 0.01779, "grad_norm": 0.00111167854629457, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.050003, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.0504, "step": 17790, "value_mse_loss_layer_000": 0.000378, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015015, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.02478, "value_mse_loss_layer_024": 0.026978, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.048584, "value_mse_loss_layer_030": 0.05249, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000224, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.000205, "vq_loss_layer_021": 0.000366, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000347, "vq_loss_layer_026": 0.000479, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.003036 }, { "ce_loss": 2.320772, "epoch": 0.0178, "grad_norm": 0.0011038576485589147, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049432, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.049814, "step": 17800, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.02771, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.000133, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000173, "vq_loss_layer_010": 0.000151, "vq_loss_layer_011": 0.000163, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000305, "vq_loss_layer_015": 0.000315, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000158, "vq_loss_layer_021": 0.000263, "vq_loss_layer_022": 0.00021, "vq_loss_layer_023": 0.000218, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000378, "vq_loss_layer_027": 0.000429, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.001808, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.333126, "epoch": 0.01781, "grad_norm": 0.0011768582044169307, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.058838, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.10791, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049542, "kv_vq_loss": 0.00038, "learning_rate": 0.001, "loss": 0.049915, "step": 17810, "value_mse_loss_layer_000": 0.000387, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007172, "value_mse_loss_layer_004": 0.006592, "value_mse_loss_layer_005": 0.006042, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013184, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015442, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000134, "vq_loss_layer_009": 0.000181, "vq_loss_layer_010": 0.000149, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000292, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000309, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.00015, "vq_loss_layer_019": 0.000132, "vq_loss_layer_020": 0.000154, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000173, "vq_loss_layer_023": 0.000193, "vq_loss_layer_024": 0.000185, "vq_loss_layer_025": 0.000257, "vq_loss_layer_026": 0.000376, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.00061, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.002502 }, { "ce_loss": 2.306837, "epoch": 0.01782, "grad_norm": 0.0010813407134264708, "key_mse_loss_layer_000": 0.00264, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.124512, "key_mse_loss_layer_014": 0.122559, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.05957, "kv_mse_loss": 0.04964, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050052, "step": 17820, "value_mse_loss_layer_000": 0.000355, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007721, "value_mse_loss_layer_005": 0.006989, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009399, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.016113, "value_mse_loss_layer_010": 0.013123, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.015076, "value_mse_loss_layer_013": 0.017456, "value_mse_loss_layer_014": 0.017578, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.015259, "value_mse_loss_layer_017": 0.020264, "value_mse_loss_layer_018": 0.016724, "value_mse_loss_layer_019": 0.019287, "value_mse_loss_layer_020": 0.021851, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.4e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 7e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000237, "vq_loss_layer_010": 0.000227, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.000338, "vq_loss_layer_014": 0.000444, "vq_loss_layer_015": 0.000456, "vq_loss_layer_016": 0.000381, "vq_loss_layer_017": 0.000389, "vq_loss_layer_018": 0.000209, "vq_loss_layer_019": 0.000189, "vq_loss_layer_020": 0.000256, "vq_loss_layer_021": 0.000456, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.000309, "vq_loss_layer_024": 0.00037, "vq_loss_layer_025": 0.000523, "vq_loss_layer_026": 0.000572, "vq_loss_layer_027": 0.00074, "vq_loss_layer_028": 0.00116, "vq_loss_layer_029": 0.001045, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.00412 }, { "ce_loss": 2.302786, "epoch": 0.01783, "grad_norm": 0.0011055886279791594, "key_mse_loss_layer_000": 0.004517, "key_mse_loss_layer_001": 0.011719, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.057129, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.071777, "key_mse_loss_layer_007": 0.080078, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.091309, "key_mse_loss_layer_030": 0.095215, "key_mse_loss_layer_031": 0.080566, "kv_mse_loss": 0.050146, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050555, "step": 17830, "value_mse_loss_layer_000": 0.000414, "value_mse_loss_layer_001": 0.001122, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.061523, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000261, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000196, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000242, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.000603, "vq_loss_layer_029": 0.001022, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.28032, "epoch": 0.01784, "grad_norm": 0.0012129832757636905, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.101074, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049753, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050174, "step": 17840, "value_mse_loss_layer_000": 0.00038, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000205, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000205, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000334, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000467, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.000904, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.003067 }, { "ce_loss": 2.339231, "epoch": 0.01785, "grad_norm": 0.001095412066206336, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049734, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.050116, "step": 17850, "value_mse_loss_layer_000": 0.00038, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006805, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046143, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000116, "vq_loss_layer_007": 0.000176, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000473, "vq_loss_layer_027": 0.000469, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.003082 }, { "ce_loss": 2.312466, "epoch": 0.01786, "grad_norm": 0.0011694771237671375, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049625, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050024, "step": 17860, "value_mse_loss_layer_000": 0.000389, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000142, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000177, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000444, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.001755, "vq_loss_layer_031": 0.002502 }, { "ce_loss": 2.312446, "epoch": 0.01787, "grad_norm": 0.0010311183286830783, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049579, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.049988, "step": 17870, "value_mse_loss_layer_000": 0.000372, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013367, "value_mse_loss_layer_013": 0.014709, "value_mse_loss_layer_014": 0.01532, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.013672, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.015198, "value_mse_loss_layer_019": 0.017578, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.021729, "value_mse_loss_layer_022": 0.021851, "value_mse_loss_layer_023": 0.024902, "value_mse_loss_layer_024": 0.027588, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.028687, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.041992, "value_mse_loss_layer_029": 0.048828, "value_mse_loss_layer_030": 0.053711, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000286, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000238, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000441, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000725, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.002747 }, { "ce_loss": 2.279005, "epoch": 0.01788, "grad_norm": 0.001177698839455843, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.079102, "kv_mse_loss": 0.049481, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.049899, "step": 17880, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007172, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.00019, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000219, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000221, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.00079, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.002136, "vq_loss_layer_031": 0.00354 }, { "ce_loss": 2.336878, "epoch": 0.01789, "grad_norm": 0.0014852238819003105, "key_mse_loss_layer_000": 0.003647, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049765, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.050156, "step": 17890, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.050293, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000366, "vq_loss_layer_026": 0.000456, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000961, "vq_loss_layer_030": 0.00235, "vq_loss_layer_031": 0.003372 }, { "ce_loss": 2.315694, "epoch": 0.0179, "grad_norm": 0.001221208367496729, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.056396, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049847, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050284, "step": 17900, "value_mse_loss_layer_000": 0.000385, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.006683, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000139, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000357, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000282, "vq_loss_layer_022": 0.000228, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000213, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.001595, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.309496, "epoch": 0.01791, "grad_norm": 0.001009470783174038, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.0495, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.049878, "step": 17910, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014465, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000355, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000809, "vq_loss_layer_030": 0.001694, "vq_loss_layer_031": 0.002502 }, { "ce_loss": 2.301256, "epoch": 0.01792, "grad_norm": 0.0011608087224885821, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.04989, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.05032, "step": 17920, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000347, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.001648, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.313161, "epoch": 0.01793, "grad_norm": 0.0011168221244588494, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049863, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050269, "step": 17930, "value_mse_loss_layer_000": 0.00037, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.009155, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.4e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000174, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000205, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.000349, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000437, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.00038, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.000881, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.003098 }, { "ce_loss": 2.250879, "epoch": 0.01794, "grad_norm": 0.001113930600695312, "key_mse_loss_layer_000": 0.003571, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049847, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050235, "step": 17940, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000227, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000203, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.00032, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.002686 }, { "ce_loss": 2.322172, "epoch": 0.01795, "grad_norm": 0.0012192163849249482, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049734, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.050131, "step": 17950, "value_mse_loss_layer_000": 0.000385, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006744, "value_mse_loss_layer_005": 0.006287, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.2e-05, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000189, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000165, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000218, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000444, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000755, "vq_loss_layer_030": 0.001823, "vq_loss_layer_031": 0.002563 }, { "ce_loss": 2.302062, "epoch": 0.01796, "grad_norm": 0.0011146074393764138, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.106445, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.077148, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.12793, "key_mse_loss_layer_015": 0.114746, "key_mse_loss_layer_016": 0.108398, "key_mse_loss_layer_017": 0.108887, "key_mse_loss_layer_018": 0.115234, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.103027, "key_mse_loss_layer_023": 0.099609, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.087891, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.093262, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.050317, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050723, "step": 17960, "value_mse_loss_layer_000": 0.000395, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015442, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.013672, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015076, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.021484, "value_mse_loss_layer_022": 0.021606, "value_mse_loss_layer_023": 0.024536, "value_mse_loss_layer_024": 0.027344, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.028564, "value_mse_loss_layer_027": 0.036865, "value_mse_loss_layer_028": 0.04126, "value_mse_loss_layer_029": 0.047852, "value_mse_loss_layer_030": 0.053223, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.00032, "vq_loss_layer_016": 0.000282, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000163, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000793, "vq_loss_layer_029": 0.000748, "vq_loss_layer_030": 0.001778, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.250933, "epoch": 0.01797, "grad_norm": 0.000991961220279336, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049896, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050302, "step": 17970, "value_mse_loss_layer_000": 0.00038, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054199, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.00022, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000452, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000729, "vq_loss_layer_030": 0.001633, "vq_loss_layer_031": 0.002563 }, { "ce_loss": 2.332966, "epoch": 0.01798, "grad_norm": 0.0012377242092043161, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.095703, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.09375, "key_mse_loss_layer_016": 0.085449, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.094727, "key_mse_loss_layer_019": 0.08252, "key_mse_loss_layer_020": 0.089355, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.086426, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.076172, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.049448, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.049826, "step": 17980, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 4.8e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.000139, "vq_loss_layer_008": 0.000138, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.00029, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000313, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000163, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000157, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000197, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000213, "vq_loss_layer_025": 0.00024, "vq_loss_layer_026": 0.000383, "vq_loss_layer_027": 0.00041, "vq_loss_layer_028": 0.000725, "vq_loss_layer_029": 0.000759, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.255148, "epoch": 0.01799, "grad_norm": 0.0011070242617279291, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.049609, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.049991, "step": 17990, "value_mse_loss_layer_000": 0.000393, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006287, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000313, "vq_loss_layer_015": 0.000332, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000167, "vq_loss_layer_019": 0.00014, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000213, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.002838 }, { "ce_loss": 2.31287, "epoch": 0.018, "grad_norm": 0.0012183339567855, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049838, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.050238, "step": 18000, "value_mse_loss_layer_000": 0.000399, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.02771, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000219, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000236, "vq_loss_layer_021": 0.000349, "vq_loss_layer_022": 0.00029, "vq_loss_layer_023": 0.000345, "vq_loss_layer_024": 0.000332, "vq_loss_layer_025": 0.000393, "vq_loss_layer_026": 0.000523, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.003036 }, { "ce_loss": 2.282738, "epoch": 0.01801, "grad_norm": 0.0010684888111427426, "key_mse_loss_layer_000": 0.003632, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.123535, "key_mse_loss_layer_014": 0.119629, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.050052, "kv_vq_loss": 0.000405, "learning_rate": 0.001, "loss": 0.050467, "step": 18010, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001083, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.008057, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014343, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016602, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000169, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000235, "vq_loss_layer_010": 0.000212, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.00034, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000357, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000256, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000277, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.000425, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.000984, "vq_loss_layer_029": 0.001038, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.003448 }, { "ce_loss": 2.253151, "epoch": 0.01802, "grad_norm": 0.0012192766880616546, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.050015, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050439, "step": 18020, "value_mse_loss_layer_000": 0.000378, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000172, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.000345, "vq_loss_layer_022": 0.000214, "vq_loss_layer_023": 0.000231, "vq_loss_layer_024": 0.000209, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.289393, "epoch": 0.01803, "grad_norm": 0.001271416898816824, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049689, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.050122, "step": 18030, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007141, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000191, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000309, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.00037, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.002884 }, { "ce_loss": 2.334732, "epoch": 0.01804, "grad_norm": 0.0011996753746643662, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.096191, "key_mse_loss_layer_009": 0.104492, "key_mse_loss_layer_010": 0.116211, "key_mse_loss_layer_011": 0.108887, "key_mse_loss_layer_012": 0.081543, "key_mse_loss_layer_013": 0.146484, "key_mse_loss_layer_014": 0.143555, "key_mse_loss_layer_015": 0.131836, "key_mse_loss_layer_016": 0.125977, "key_mse_loss_layer_017": 0.123047, "key_mse_loss_layer_018": 0.129883, "key_mse_loss_layer_019": 0.101562, "key_mse_loss_layer_020": 0.119141, "key_mse_loss_layer_021": 0.111816, "key_mse_loss_layer_022": 0.117188, "key_mse_loss_layer_023": 0.114258, "key_mse_loss_layer_024": 0.090332, "key_mse_loss_layer_025": 0.08252, "key_mse_loss_layer_026": 0.101074, "key_mse_loss_layer_027": 0.094727, "key_mse_loss_layer_028": 0.103516, "key_mse_loss_layer_029": 0.088867, "key_mse_loss_layer_030": 0.103027, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.050049, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.05047, "step": 18040, "value_mse_loss_layer_000": 0.000374, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.010498, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014465, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.016235, "value_mse_loss_layer_016": 0.013062, "value_mse_loss_layer_017": 0.016968, "value_mse_loss_layer_018": 0.014282, "value_mse_loss_layer_019": 0.01709, "value_mse_loss_layer_020": 0.018799, "value_mse_loss_layer_021": 0.020264, "value_mse_loss_layer_022": 0.020142, "value_mse_loss_layer_023": 0.021362, "value_mse_loss_layer_024": 0.024536, "value_mse_loss_layer_025": 0.029419, "value_mse_loss_layer_026": 0.025269, "value_mse_loss_layer_027": 0.033691, "value_mse_loss_layer_028": 0.037109, "value_mse_loss_layer_029": 0.041748, "value_mse_loss_layer_030": 0.049316, "value_mse_loss_layer_031": 0.045166, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000197, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.000231, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000435, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000209, "vq_loss_layer_020": 0.000256, "vq_loss_layer_021": 0.000391, "vq_loss_layer_022": 0.00034, "vq_loss_layer_023": 0.000357, "vq_loss_layer_024": 0.000435, "vq_loss_layer_025": 0.000584, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000668, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.000759, "vq_loss_layer_030": 0.002945, "vq_loss_layer_031": 0.003662 }, { "ce_loss": 2.298429, "epoch": 0.01805, "grad_norm": 0.0011438046349212527, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.131836, "key_mse_loss_layer_014": 0.128906, "key_mse_loss_layer_015": 0.115234, "key_mse_loss_layer_016": 0.109375, "key_mse_loss_layer_017": 0.108398, "key_mse_loss_layer_018": 0.116211, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.101562, "key_mse_loss_layer_022": 0.105469, "key_mse_loss_layer_023": 0.100586, "key_mse_loss_layer_024": 0.081055, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.088379, "key_mse_loss_layer_028": 0.096191, "key_mse_loss_layer_029": 0.087402, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049875, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050272, "step": 18050, "value_mse_loss_layer_000": 0.000383, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.01709, "value_mse_loss_layer_016": 0.013855, "value_mse_loss_layer_017": 0.016968, "value_mse_loss_layer_018": 0.015015, "value_mse_loss_layer_019": 0.0177, "value_mse_loss_layer_020": 0.019165, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.021851, "value_mse_loss_layer_023": 0.023926, "value_mse_loss_layer_024": 0.027588, "value_mse_loss_layer_025": 0.032471, "value_mse_loss_layer_026": 0.029053, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.048584, "value_mse_loss_layer_030": 0.053467, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.00013, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000368, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000261, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000338, "vq_loss_layer_025": 0.000412, "vq_loss_layer_026": 0.000565, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.000908, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.003052 }, { "ce_loss": 2.322846, "epoch": 0.01806, "grad_norm": 0.0010780716547742486, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049738, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050137, "step": 18060, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000172, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000298, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000163, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000277, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000744, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002457, "vq_loss_layer_031": 0.003159 }, { "ce_loss": 2.319543, "epoch": 0.01807, "grad_norm": 0.0012323982082307339, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.045166, "key_mse_loss_layer_004": 0.043701, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049481, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.049875, "step": 18070, "value_mse_loss_layer_000": 0.000368, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014587, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 3.9e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.2e-05, "vq_loss_layer_007": 0.000138, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000296, "vq_loss_layer_025": 0.000366, "vq_loss_layer_026": 0.000607, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.000984, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.259436, "epoch": 0.01808, "grad_norm": 0.0010947233531624079, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.049796, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050204, "step": 18080, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014526, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000205, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.001083, "vq_loss_layer_030": 0.001778, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.325745, "epoch": 0.01809, "grad_norm": 0.0010256720706820488, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.119629, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049438, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.049847, "step": 18090, "value_mse_loss_layer_000": 0.000383, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.014954, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.02417, "value_mse_loss_layer_024": 0.026611, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.027954, "value_mse_loss_layer_027": 0.035889, "value_mse_loss_layer_028": 0.040771, "value_mse_loss_layer_029": 0.046387, "value_mse_loss_layer_030": 0.050781, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000212, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000431, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.000671, "vq_loss_layer_030": 0.001419, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.299615, "epoch": 0.0181, "grad_norm": 0.0012756985379382968, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.094238, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049435, "kv_vq_loss": 0.000382, "learning_rate": 0.001, "loss": 0.049805, "step": 18100, "value_mse_loss_layer_000": 0.00038, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000278, "vq_loss_layer_022": 0.000219, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003036 }, { "ce_loss": 2.301553, "epoch": 0.01811, "grad_norm": 0.0011573426891118288, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.058594, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.044678, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.060303, "kv_mse_loss": 0.049976, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050372, "step": 18110, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.004211, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007782, "value_mse_loss_layer_005": 0.007172, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.00946, "value_mse_loss_layer_008": 0.011475, "value_mse_loss_layer_009": 0.016479, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013672, "value_mse_loss_layer_012": 0.014954, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.019165, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.025146, "value_mse_loss_layer_024": 0.027588, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.053711, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 6.6e-05, "vq_loss_layer_005": 7.9e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000196, "vq_loss_layer_009": 0.000282, "vq_loss_layer_010": 0.000241, "vq_loss_layer_011": 0.000242, "vq_loss_layer_012": 0.000357, "vq_loss_layer_013": 0.00038, "vq_loss_layer_014": 0.000431, "vq_loss_layer_015": 0.000463, "vq_loss_layer_016": 0.000393, "vq_loss_layer_017": 0.000381, "vq_loss_layer_018": 0.000223, "vq_loss_layer_019": 0.000246, "vq_loss_layer_020": 0.000277, "vq_loss_layer_021": 0.00045, "vq_loss_layer_022": 0.000341, "vq_loss_layer_023": 0.000332, "vq_loss_layer_024": 0.000393, "vq_loss_layer_025": 0.000471, "vq_loss_layer_026": 0.000553, "vq_loss_layer_027": 0.00066, "vq_loss_layer_028": 0.00095, "vq_loss_layer_029": 0.001045, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.00415 }, { "ce_loss": 2.331313, "epoch": 0.01812, "grad_norm": 0.0009950330713763833, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049548, "kv_vq_loss": 0.000382, "learning_rate": 0.001, "loss": 0.049921, "step": 18120, "value_mse_loss_layer_000": 0.000372, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.058838, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9e-05, "vq_loss_layer_007": 0.000137, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000179, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000294, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000239, "vq_loss_layer_023": 0.000239, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.000927, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.273386, "epoch": 0.01813, "grad_norm": 0.0012375651858747005, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.059326, "key_mse_loss_layer_005": 0.064941, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049866, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050287, "step": 18130, "value_mse_loss_layer_000": 0.000372, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013306, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 7.3e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000211, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000232, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.002258, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.340158, "epoch": 0.01814, "grad_norm": 0.0010846015065908432, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.050104, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050485, "step": 18140, "value_mse_loss_layer_000": 0.000372, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.054199, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000412, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000241, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000437, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000725, "vq_loss_layer_029": 0.000866, "vq_loss_layer_030": 0.001884, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.304029, "epoch": 0.01815, "grad_norm": 0.0010207126615568995, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.049686, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.050067, "step": 18150, "value_mse_loss_layer_000": 0.000387, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.024658, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.036133, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.053955, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.1e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000228, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000207, "vq_loss_layer_024": 0.00024, "vq_loss_layer_025": 0.000227, "vq_loss_layer_026": 0.000334, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.001579, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.257788, "epoch": 0.01816, "grad_norm": 0.0010864812647923827, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.049747, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050156, "step": 18160, "value_mse_loss_layer_000": 0.000387, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014465, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000189, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000286, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000618, "vq_loss_layer_028": 0.000999, "vq_loss_layer_029": 0.001427, "vq_loss_layer_030": 0.002701, "vq_loss_layer_031": 0.003586 }, { "ce_loss": 2.320393, "epoch": 0.01817, "grad_norm": 0.001162375439889729, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049698, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050125, "step": 18170, "value_mse_loss_layer_000": 0.000387, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.014587, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.032471, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.054688, "value_mse_loss_layer_030": 0.059326, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.000144, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000155, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.000401, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.00021, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000156, "vq_loss_layer_021": 0.000265, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000196, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.000248, "vq_loss_layer_026": 0.000399, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.00095, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.347385, "epoch": 0.01818, "grad_norm": 0.00098403450101614, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.052979, "key_mse_loss_layer_004": 0.060791, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.049329, "kv_vq_loss": 0.000371, "learning_rate": 0.001, "loss": 0.04968, "step": 18180, "value_mse_loss_layer_000": 0.000383, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006531, "value_mse_loss_layer_005": 0.00592, "value_mse_loss_layer_006": 0.007935, "value_mse_loss_layer_007": 0.00824, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.014404, "value_mse_loss_layer_010": 0.011658, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013123, "value_mse_loss_layer_013": 0.014465, "value_mse_loss_layer_014": 0.015198, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 4.6e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000134, "vq_loss_layer_009": 0.000179, "vq_loss_layer_010": 0.000152, "vq_loss_layer_011": 0.000173, "vq_loss_layer_012": 0.000294, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.000309, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000157, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000167, "vq_loss_layer_021": 0.000269, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000205, "vq_loss_layer_024": 0.000205, "vq_loss_layer_025": 0.000236, "vq_loss_layer_026": 0.000359, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.00058, "vq_loss_layer_029": 0.000736, "vq_loss_layer_030": 0.001709, "vq_loss_layer_031": 0.002335 }, { "ce_loss": 2.248629, "epoch": 0.01819, "grad_norm": 0.0012048622593283653, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049991, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050391, "step": 18190, "value_mse_loss_layer_000": 0.000385, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.049805, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.00082, "vq_loss_layer_029": 0.000862, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.002914 }, { "ce_loss": 2.290018, "epoch": 0.0182, "grad_norm": 0.001053577521815896, "key_mse_loss_layer_000": 0.003479, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049899, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050293, "step": 18200, "value_mse_loss_layer_000": 0.000391, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014404, "value_mse_loss_layer_010": 0.011658, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.014587, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.031006, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.054932, "value_mse_loss_layer_030": 0.059814, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000184, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000164, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000236, "vq_loss_layer_026": 0.000385, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.001663, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.246354, "epoch": 0.01821, "grad_norm": 0.0012220784556120634, "key_mse_loss_layer_000": 0.003799, "key_mse_loss_layer_001": 0.012451, "key_mse_loss_layer_002": 0.066406, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.083008, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.10498, "key_mse_loss_layer_014": 0.104004, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.094727, "key_mse_loss_layer_020": 0.101562, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.102051, "key_mse_loss_layer_023": 0.103516, "key_mse_loss_layer_024": 0.086914, "key_mse_loss_layer_025": 0.084473, "key_mse_loss_layer_026": 0.095703, "key_mse_loss_layer_027": 0.104492, "key_mse_loss_layer_028": 0.106445, "key_mse_loss_layer_029": 0.103516, "key_mse_loss_layer_030": 0.100586, "key_mse_loss_layer_031": 0.086426, "kv_mse_loss": 0.050333, "kv_vq_loss": 0.000404, "learning_rate": 0.001, "loss": 0.050757, "step": 18210, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.001099, "value_mse_loss_layer_002": 0.004303, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.00769, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010437, "value_mse_loss_layer_009": 0.013489, "value_mse_loss_layer_010": 0.010803, "value_mse_loss_layer_011": 0.011597, "value_mse_loss_layer_012": 0.012329, "value_mse_loss_layer_013": 0.013733, "value_mse_loss_layer_014": 0.015137, "value_mse_loss_layer_015": 0.016479, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.0177, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.022095, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.026367, "value_mse_loss_layer_023": 0.030762, "value_mse_loss_layer_024": 0.035889, "value_mse_loss_layer_025": 0.041504, "value_mse_loss_layer_026": 0.039551, "value_mse_loss_layer_027": 0.051758, "value_mse_loss_layer_028": 0.056885, "value_mse_loss_layer_029": 0.06543, "value_mse_loss_layer_030": 0.074707, "value_mse_loss_layer_031": 0.057373, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.2e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 4.8e-05, "vq_loss_layer_006": 9e-05, "vq_loss_layer_007": 0.000115, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000149, "vq_loss_layer_010": 0.000133, "vq_loss_layer_011": 0.000154, "vq_loss_layer_012": 0.000238, "vq_loss_layer_013": 0.000196, "vq_loss_layer_014": 0.00028, "vq_loss_layer_015": 0.000301, "vq_loss_layer_016": 0.000259, "vq_loss_layer_017": 0.000229, "vq_loss_layer_018": 0.000257, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000125, "vq_loss_layer_021": 0.000214, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000174, "vq_loss_layer_024": 0.000239, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.001198, "vq_loss_layer_029": 0.001564, "vq_loss_layer_030": 0.002869, "vq_loss_layer_031": 0.004578 }, { "ce_loss": 2.290665, "epoch": 0.01822, "grad_norm": 0.0009933116380125284, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.050058, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.05047, "step": 18220, "value_mse_loss_layer_000": 0.000383, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013367, "value_mse_loss_layer_013": 0.014648, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.016357, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000178, "vq_loss_layer_012": 0.000292, "vq_loss_layer_013": 0.000244, "vq_loss_layer_014": 0.000317, "vq_loss_layer_015": 0.000336, "vq_loss_layer_016": 0.000284, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000154, "vq_loss_layer_021": 0.000269, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000236, "vq_loss_layer_026": 0.000359, "vq_loss_layer_027": 0.000458, "vq_loss_layer_028": 0.000645, "vq_loss_layer_029": 0.000729, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.002411 }, { "ce_loss": 2.323238, "epoch": 0.01823, "grad_norm": 0.0010076946346089244, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.081055, "key_mse_loss_layer_020": 0.089355, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.083984, "key_mse_loss_layer_024": 0.066406, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.073242, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.049362, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.049762, "step": 18230, "value_mse_loss_layer_000": 0.000378, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006683, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.025024, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.028809, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.053467, "value_mse_loss_layer_031": 0.044922, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 3.8e-05, "vq_loss_layer_005": 4.8e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.000139, "vq_loss_layer_008": 0.000136, "vq_loss_layer_009": 0.000186, "vq_loss_layer_010": 0.000149, "vq_loss_layer_011": 0.000169, "vq_loss_layer_012": 0.000294, "vq_loss_layer_013": 0.000242, "vq_loss_layer_014": 0.000313, "vq_loss_layer_015": 0.000334, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.000153, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.00022, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000847, "vq_loss_layer_029": 0.001373, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.003998 }, { "ce_loss": 2.293979, "epoch": 0.01824, "grad_norm": 0.0012075951090082526, "key_mse_loss_layer_000": 0.002808, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.065918, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.072754, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.075195, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.049921, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050305, "step": 18240, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007172, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015076, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019287, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.022217, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.0271, "value_mse_loss_layer_025": 0.032227, "value_mse_loss_layer_026": 0.02832, "value_mse_loss_layer_027": 0.035645, "value_mse_loss_layer_028": 0.040771, "value_mse_loss_layer_029": 0.046875, "value_mse_loss_layer_030": 0.050781, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.1e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000341, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000366, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.00029, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000374, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000729, "vq_loss_layer_030": 0.001663, "vq_loss_layer_031": 0.00325 }, { "ce_loss": 2.350856, "epoch": 0.01825, "grad_norm": 0.0011351348366588354, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049527, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.04993, "step": 18250, "value_mse_loss_layer_000": 0.000378, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007202, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000179, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.000173, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000269, "vq_loss_layer_018": 0.000198, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000168, "vq_loss_layer_021": 0.000278, "vq_loss_layer_022": 0.000197, "vq_loss_layer_023": 0.000221, "vq_loss_layer_024": 0.000214, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000387, "vq_loss_layer_027": 0.000427, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000683, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.002411 }, { "ce_loss": 2.266395, "epoch": 0.01826, "grad_norm": 0.0011624643811956048, "key_mse_loss_layer_000": 0.003708, "key_mse_loss_layer_001": 0.011475, "key_mse_loss_layer_002": 0.061523, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.08252, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.092285, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.102539, "key_mse_loss_layer_014": 0.101074, "key_mse_loss_layer_015": 0.092285, "key_mse_loss_layer_016": 0.085938, "key_mse_loss_layer_017": 0.089844, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.097168, "key_mse_loss_layer_024": 0.080566, "key_mse_loss_layer_025": 0.07959, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.094727, "key_mse_loss_layer_028": 0.097656, "key_mse_loss_layer_029": 0.094727, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.049759, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050165, "step": 18260, "value_mse_loss_layer_000": 0.000372, "value_mse_loss_layer_001": 0.001068, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.008179, "value_mse_loss_layer_004": 0.007446, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014343, "value_mse_loss_layer_010": 0.011414, "value_mse_loss_layer_011": 0.011963, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014465, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.017212, "value_mse_loss_layer_019": 0.019775, "value_mse_loss_layer_020": 0.022217, "value_mse_loss_layer_021": 0.024902, "value_mse_loss_layer_022": 0.026733, "value_mse_loss_layer_023": 0.03064, "value_mse_loss_layer_024": 0.035156, "value_mse_loss_layer_025": 0.040039, "value_mse_loss_layer_026": 0.038818, "value_mse_loss_layer_027": 0.050537, "value_mse_loss_layer_028": 0.055664, "value_mse_loss_layer_029": 0.064453, "value_mse_loss_layer_030": 0.069336, "value_mse_loss_layer_031": 0.053711, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.7e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 4.6e-05, "vq_loss_layer_006": 8.5e-05, "vq_loss_layer_007": 0.00013, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000173, "vq_loss_layer_010": 0.000148, "vq_loss_layer_011": 0.000153, "vq_loss_layer_012": 0.000269, "vq_loss_layer_013": 0.000231, "vq_loss_layer_014": 0.00029, "vq_loss_layer_015": 0.000319, "vq_loss_layer_016": 0.000288, "vq_loss_layer_017": 0.000242, "vq_loss_layer_018": 0.000232, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000152, "vq_loss_layer_021": 0.000252, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000195, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000473, "vq_loss_layer_027": 0.000568, "vq_loss_layer_028": 0.001144, "vq_loss_layer_029": 0.001434, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.26739, "epoch": 0.01827, "grad_norm": 0.001235125819221139, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.053711, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.04953, "kv_vq_loss": 0.00038, "learning_rate": 0.001, "loss": 0.049899, "step": 18270, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007111, "value_mse_loss_layer_004": 0.006744, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023926, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000136, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000307, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000286, "vq_loss_layer_017": 0.000265, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.00014, "vq_loss_layer_020": 0.000163, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000201, "vq_loss_layer_024": 0.000194, "vq_loss_layer_025": 0.000214, "vq_loss_layer_026": 0.000343, "vq_loss_layer_027": 0.000439, "vq_loss_layer_028": 0.00058, "vq_loss_layer_029": 0.000805, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.002441 }, { "ce_loss": 2.300663, "epoch": 0.01828, "grad_norm": 0.001048313919454813, "key_mse_loss_layer_000": 0.00386, "key_mse_loss_layer_001": 0.011169, "key_mse_loss_layer_002": 0.060547, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.075684, "key_mse_loss_layer_013": 0.129883, "key_mse_loss_layer_014": 0.126953, "key_mse_loss_layer_015": 0.115723, "key_mse_loss_layer_016": 0.109375, "key_mse_loss_layer_017": 0.110352, "key_mse_loss_layer_018": 0.119629, "key_mse_loss_layer_019": 0.096191, "key_mse_loss_layer_020": 0.108887, "key_mse_loss_layer_021": 0.101562, "key_mse_loss_layer_022": 0.10498, "key_mse_loss_layer_023": 0.10498, "key_mse_loss_layer_024": 0.082031, "key_mse_loss_layer_025": 0.080566, "key_mse_loss_layer_026": 0.093262, "key_mse_loss_layer_027": 0.093262, "key_mse_loss_layer_028": 0.100586, "key_mse_loss_layer_029": 0.087891, "key_mse_loss_layer_030": 0.096191, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.049686, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050104, "step": 18280, "value_mse_loss_layer_000": 0.000397, "value_mse_loss_layer_001": 0.001083, "value_mse_loss_layer_002": 0.00415, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007599, "value_mse_loss_layer_005": 0.006927, "value_mse_loss_layer_006": 0.008972, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.016846, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.022095, "value_mse_loss_layer_023": 0.02478, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.036621, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.051514, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.00014, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000227, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000387, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.00036, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000226, "vq_loss_layer_025": 0.000425, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.001022, "vq_loss_layer_029": 0.000954, "vq_loss_layer_030": 0.002213, "vq_loss_layer_031": 0.004211 }, { "ce_loss": 2.293618, "epoch": 0.01829, "grad_norm": 0.0010471896966919303, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049527, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.049908, "step": 18290, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006683, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 4.7e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000301, "vq_loss_layer_015": 0.000338, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000265, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000123, "vq_loss_layer_020": 0.000165, "vq_loss_layer_021": 0.000275, "vq_loss_layer_022": 0.000172, "vq_loss_layer_023": 0.000213, "vq_loss_layer_024": 0.000175, "vq_loss_layer_025": 0.000205, "vq_loss_layer_026": 0.000351, "vq_loss_layer_027": 0.000378, "vq_loss_layer_028": 0.000561, "vq_loss_layer_029": 0.000797, "vq_loss_layer_030": 0.00145, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.360765, "epoch": 0.0183, "grad_norm": 0.0012947501381859183, "key_mse_loss_layer_000": 0.003601, "key_mse_loss_layer_001": 0.011353, "key_mse_loss_layer_002": 0.05957, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.089844, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.104004, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.129883, "key_mse_loss_layer_014": 0.126953, "key_mse_loss_layer_015": 0.11377, "key_mse_loss_layer_016": 0.10791, "key_mse_loss_layer_017": 0.109375, "key_mse_loss_layer_018": 0.115723, "key_mse_loss_layer_019": 0.095703, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.100098, "key_mse_loss_layer_022": 0.104492, "key_mse_loss_layer_023": 0.098145, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.088867, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049579, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.049991, "step": 18300, "value_mse_loss_layer_000": 0.000385, "value_mse_loss_layer_001": 0.001076, "value_mse_loss_layer_002": 0.004272, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007568, "value_mse_loss_layer_005": 0.006958, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.011414, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012939, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016846, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.023804, "value_mse_loss_layer_024": 0.026978, "value_mse_loss_layer_025": 0.031738, "value_mse_loss_layer_026": 0.028564, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.040527, "value_mse_loss_layer_029": 0.047363, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.6e-05, "vq_loss_layer_006": 0.000114, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.00019, "vq_loss_layer_009": 0.000233, "vq_loss_layer_010": 0.000233, "vq_loss_layer_011": 0.000227, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000326, "vq_loss_layer_014": 0.000435, "vq_loss_layer_015": 0.000406, "vq_loss_layer_016": 0.000349, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000196, "vq_loss_layer_020": 0.000218, "vq_loss_layer_021": 0.000355, "vq_loss_layer_022": 0.000301, "vq_loss_layer_023": 0.000269, "vq_loss_layer_024": 0.000319, "vq_loss_layer_025": 0.000393, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000637, "vq_loss_layer_028": 0.000816, "vq_loss_layer_029": 0.000946, "vq_loss_layer_030": 0.002701, "vq_loss_layer_031": 0.003815 }, { "ce_loss": 2.298511, "epoch": 0.01831, "grad_norm": 0.0010576598579064012, "key_mse_loss_layer_000": 0.002731, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049579, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.049954, "step": 18310, "value_mse_loss_layer_000": 0.000364, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.027588, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.048828, "value_mse_loss_layer_030": 0.053711, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000131, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.000152, "vq_loss_layer_011": 0.000171, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000277, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000164, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.000381, "vq_loss_layer_028": 0.000633, "vq_loss_layer_029": 0.000668, "vq_loss_layer_030": 0.00135, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.353819, "epoch": 0.01832, "grad_norm": 0.00106052088085562, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049701, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.050085, "step": 18320, "value_mse_loss_layer_000": 0.00037, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000167, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.000759, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.002838 }, { "ce_loss": 2.26743, "epoch": 0.01833, "grad_norm": 0.0012501785531640053, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.04946, "kv_vq_loss": 0.000402, "learning_rate": 0.001, "loss": 0.049878, "step": 18330, "value_mse_loss_layer_000": 0.00037, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.015015, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.5e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000319, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000188, "vq_loss_layer_020": 0.000229, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000332, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000725, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.003067 }, { "ce_loss": 2.320134, "epoch": 0.01834, "grad_norm": 0.0010368741350248456, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.05542, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049426, "kv_vq_loss": 0.000381, "learning_rate": 0.001, "loss": 0.049805, "step": 18340, "value_mse_loss_layer_000": 0.000364, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007263, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.047852, "value_mse_loss_layer_030": 0.052979, "value_mse_loss_layer_031": 0.045654, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000181, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000203, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000224, "vq_loss_layer_024": 0.000239, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000399, "vq_loss_layer_028": 0.000626, "vq_loss_layer_029": 0.000729, "vq_loss_layer_030": 0.001595, "vq_loss_layer_031": 0.002472 }, { "ce_loss": 2.313879, "epoch": 0.01835, "grad_norm": 0.0011134237283840775, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.049823, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050211, "step": 18350, "value_mse_loss_layer_000": 0.000372, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003891, "value_mse_loss_layer_003": 0.007263, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000286, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000209, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000198, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000429, "vq_loss_layer_028": 0.000713, "vq_loss_layer_029": 0.000713, "vq_loss_layer_030": 0.001648, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.293138, "epoch": 0.01836, "grad_norm": 0.0012668919516727328, "key_mse_loss_layer_000": 0.003387, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.04986, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050256, "step": 18360, "value_mse_loss_layer_000": 0.000383, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.048584, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000164, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000816, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.294987, "epoch": 0.01837, "grad_norm": 0.0009146198281086981, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.046143, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.091797, "key_mse_loss_layer_010": 0.104492, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.100586, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.084961, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.04971, "kv_vq_loss": 0.000376, "learning_rate": 0.001, "loss": 0.050076, "step": 18370, "value_mse_loss_layer_000": 0.00037, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.004181, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.060059, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000187, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000265, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000277, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000385, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000538, "vq_loss_layer_028": 0.001213, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.003662 }, { "ce_loss": 2.266661, "epoch": 0.01838, "grad_norm": 0.0013207120355218649, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.106445, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.106445, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049567, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.049957, "step": 18380, "value_mse_loss_layer_000": 0.00036, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000166, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000282, "vq_loss_layer_025": 0.000313, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.00087, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.003082 }, { "ce_loss": 2.322731, "epoch": 0.01839, "grad_norm": 0.001078541623428464, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.052002, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049658, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050098, "step": 18390, "value_mse_loss_layer_000": 0.000374, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.014587, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014709, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.013855, "value_mse_loss_layer_017": 0.016846, "value_mse_loss_layer_018": 0.015076, "value_mse_loss_layer_019": 0.017578, "value_mse_loss_layer_020": 0.018677, "value_mse_loss_layer_021": 0.021484, "value_mse_loss_layer_022": 0.021851, "value_mse_loss_layer_023": 0.024048, "value_mse_loss_layer_024": 0.027466, "value_mse_loss_layer_025": 0.031982, "value_mse_loss_layer_026": 0.028442, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.04126, "value_mse_loss_layer_029": 0.048828, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000181, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000241, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000307, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000265, "vq_loss_layer_018": 0.000162, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000164, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000219, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000408, "vq_loss_layer_027": 0.000444, "vq_loss_layer_028": 0.000668, "vq_loss_layer_029": 0.000721, "vq_loss_layer_030": 0.001854, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.373544, "epoch": 0.0184, "grad_norm": 0.0010728861670941114, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.061768, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.088867, "key_mse_loss_layer_031": 0.080078, "kv_mse_loss": 0.049576, "kv_vq_loss": 0.000379, "learning_rate": 0.001, "loss": 0.049954, "step": 18400, "value_mse_loss_layer_000": 0.00038, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007019, "value_mse_loss_layer_004": 0.00647, "value_mse_loss_layer_005": 0.005951, "value_mse_loss_layer_006": 0.007996, "value_mse_loss_layer_007": 0.008423, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029663, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 2e-06, "vq_loss_layer_003": 1.1e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 4.7e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000137, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.000149, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000307, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000286, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000133, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000263, "vq_loss_layer_022": 0.000201, "vq_loss_layer_023": 0.000221, "vq_loss_layer_024": 0.000221, "vq_loss_layer_025": 0.00024, "vq_loss_layer_026": 0.000387, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.001266, "vq_loss_layer_030": 0.001884, "vq_loss_layer_031": 0.003204 }, { "ce_loss": 2.284512, "epoch": 0.01841, "grad_norm": 0.0010367308277636766, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049634, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050027, "step": 18410, "value_mse_loss_layer_000": 0.00038, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.003891, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025391, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000176, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000243, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000315, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000136, "vq_loss_layer_020": 0.000169, "vq_loss_layer_021": 0.000305, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000223, "vq_loss_layer_024": 0.000206, "vq_loss_layer_025": 0.000254, "vq_loss_layer_026": 0.000391, "vq_loss_layer_027": 0.000399, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000732, "vq_loss_layer_030": 0.001503, "vq_loss_layer_031": 0.002502 }, { "ce_loss": 2.29796, "epoch": 0.01842, "grad_norm": 0.0011219624429941177, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049387, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.049747, "step": 18420, "value_mse_loss_layer_000": 0.000372, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006744, "value_mse_loss_layer_005": 0.006195, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010498, "value_mse_loss_layer_009": 0.014526, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013367, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015259, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015259, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.024902, "value_mse_loss_layer_024": 0.027466, "value_mse_loss_layer_025": 0.032471, "value_mse_loss_layer_026": 0.028809, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.041748, "value_mse_loss_layer_029": 0.048096, "value_mse_loss_layer_030": 0.052979, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 4.8e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000132, "vq_loss_layer_009": 0.000189, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.000288, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000156, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.000227, "vq_loss_layer_024": 0.00021, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000391, "vq_loss_layer_027": 0.000425, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000664, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.002487 }, { "ce_loss": 2.313902, "epoch": 0.01843, "grad_norm": 0.0010954315075650811, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.054688, "key_mse_loss_layer_004": 0.061035, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.074219, "kv_mse_loss": 0.049493, "kv_vq_loss": 0.000379, "learning_rate": 0.001, "loss": 0.049875, "step": 18430, "value_mse_loss_layer_000": 0.000368, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007263, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006287, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000167, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000163, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000186, "vq_loss_layer_023": 0.000203, "vq_loss_layer_024": 0.000204, "vq_loss_layer_025": 0.000223, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000584, "vq_loss_layer_029": 0.000801, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.002563 }, { "ce_loss": 2.297649, "epoch": 0.01844, "grad_norm": 0.001186351990327239, "key_mse_loss_layer_000": 0.002777, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.049701, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.050107, "step": 18440, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.027588, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.029175, "value_mse_loss_layer_027": 0.036865, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.047607, "value_mse_loss_layer_030": 0.05249, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000366, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000309, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000431, "vq_loss_layer_028": 0.000824, "vq_loss_layer_029": 0.000725, "vq_loss_layer_030": 0.001945, "vq_loss_layer_031": 0.00267 }, { "ce_loss": 2.302054, "epoch": 0.01845, "grad_norm": 0.0010102232918143272, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049969, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050351, "step": 18450, "value_mse_loss_layer_000": 0.000372, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003891, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006744, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013306, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.00029, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000317, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000166, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000165, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000197, "vq_loss_layer_023": 0.000218, "vq_loss_layer_024": 0.000195, "vq_loss_layer_025": 0.000219, "vq_loss_layer_026": 0.000359, "vq_loss_layer_027": 0.000406, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000725, "vq_loss_layer_030": 0.00145, "vq_loss_layer_031": 0.002533 }, { "ce_loss": 2.301235, "epoch": 0.01846, "grad_norm": 0.0010863062925636768, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.059326, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.049683, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050061, "step": 18460, "value_mse_loss_layer_000": 0.000366, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003845, "value_mse_loss_layer_003": 0.007141, "value_mse_loss_layer_004": 0.006561, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052979, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 2e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.00033, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000215, "vq_loss_layer_019": 0.000127, "vq_loss_layer_020": 0.000166, "vq_loss_layer_021": 0.000254, "vq_loss_layer_022": 0.000173, "vq_loss_layer_023": 0.000221, "vq_loss_layer_024": 0.00019, "vq_loss_layer_025": 0.000211, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000399, "vq_loss_layer_028": 0.000576, "vq_loss_layer_029": 0.000797, "vq_loss_layer_030": 0.001465, "vq_loss_layer_031": 0.002487 }, { "ce_loss": 2.301747, "epoch": 0.01847, "grad_norm": 0.0012051499215885997, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.04985, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.050256, "step": 18470, "value_mse_loss_layer_000": 0.00037, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007172, "value_mse_loss_layer_004": 0.006622, "value_mse_loss_layer_005": 0.006165, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015198, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.027466, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029297, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.049072, "value_mse_loss_layer_030": 0.052246, "value_mse_loss_layer_031": 0.044434, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000136, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000299, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000374, "vq_loss_layer_016": 0.000277, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000231, "vq_loss_layer_024": 0.000192, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.00037, "vq_loss_layer_027": 0.000397, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000721, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.002274 }, { "ce_loss": 2.312974, "epoch": 0.01848, "grad_norm": 0.001121529028750956, "key_mse_loss_layer_000": 0.002792, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.049902, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050299, "step": 18480, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009338, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013062, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015198, "value_mse_loss_layer_017": 0.019043, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.023682, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000254, "vq_loss_layer_010": 0.000201, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000397, "vq_loss_layer_017": 0.000397, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000208, "vq_loss_layer_020": 0.000242, "vq_loss_layer_021": 0.000385, "vq_loss_layer_022": 0.000299, "vq_loss_layer_023": 0.00032, "vq_loss_layer_024": 0.000305, "vq_loss_layer_025": 0.000362, "vq_loss_layer_026": 0.000546, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.000896, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.002045, "vq_loss_layer_031": 0.003326 }, { "ce_loss": 2.306804, "epoch": 0.01849, "grad_norm": 0.0011106572346761823, "key_mse_loss_layer_000": 0.002808, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.052734, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.049777, "kv_vq_loss": 0.000381, "learning_rate": 0.001, "loss": 0.050171, "step": 18490, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.003845, "value_mse_loss_layer_003": 0.007202, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022217, "value_mse_loss_layer_023": 0.025146, "value_mse_loss_layer_024": 0.027588, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.029053, "value_mse_loss_layer_027": 0.035889, "value_mse_loss_layer_028": 0.041504, "value_mse_loss_layer_029": 0.047607, "value_mse_loss_layer_030": 0.051758, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000199, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000412, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000713, "vq_loss_layer_030": 0.00161, "vq_loss_layer_031": 0.002686 }, { "ce_loss": 2.302129, "epoch": 0.0185, "grad_norm": 0.001196389552205801, "key_mse_loss_layer_000": 0.002884, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.074707, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.092773, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.09375, "key_mse_loss_layer_031": 0.081055, "kv_mse_loss": 0.04978, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.050165, "step": 18500, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.00708, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.006287, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.01532, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000134, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000242, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.002884 }, { "ce_loss": 2.318139, "epoch": 0.01851, "grad_norm": 0.0011405479162931442, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049863, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050284, "step": 18510, "value_mse_loss_layer_000": 0.000378, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000186, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.000224, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000248, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000465, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.000912, "vq_loss_layer_030": 0.001869, "vq_loss_layer_031": 0.002609 }, { "ce_loss": 2.349075, "epoch": 0.01852, "grad_norm": 0.001143883098848164, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.085449, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.085938, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049271, "kv_vq_loss": 0.000374, "learning_rate": 0.001, "loss": 0.049628, "step": 18520, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007782, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.014221, "value_mse_loss_layer_010": 0.011658, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013245, "value_mse_loss_layer_013": 0.014465, "value_mse_loss_layer_014": 0.015198, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.013855, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.04126, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000132, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.00029, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.000265, "vq_loss_layer_018": 0.000227, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000208, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000307, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000851, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.002396, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.333701, "epoch": 0.01853, "grad_norm": 0.001145741785876453, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049643, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050046, "step": 18530, "value_mse_loss_layer_000": 0.000364, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.01416, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000359, "vq_loss_layer_015": 0.000336, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.000462, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000797, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.003235 }, { "ce_loss": 2.333052, "epoch": 0.01854, "grad_norm": 0.0010623818961903453, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.049762, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.050143, "step": 18540, "value_mse_loss_layer_000": 0.00037, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003845, "value_mse_loss_layer_003": 0.00708, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015259, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.025146, "value_mse_loss_layer_024": 0.02771, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.029053, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.048828, "value_mse_loss_layer_030": 0.05249, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000157, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000322, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000357, "vq_loss_layer_027": 0.000423, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.00071, "vq_loss_layer_030": 0.001266, "vq_loss_layer_031": 0.002609 }, { "ce_loss": 2.281096, "epoch": 0.01855, "grad_norm": 0.0010960822692140937, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049942, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.050314, "step": 18550, "value_mse_loss_layer_000": 0.000378, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.003891, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006287, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000149, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000324, "vq_loss_layer_014": 0.000317, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000288, "vq_loss_layer_017": 0.000267, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000168, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000197, "vq_loss_layer_023": 0.000213, "vq_loss_layer_024": 0.00019, "vq_loss_layer_025": 0.000241, "vq_loss_layer_026": 0.00036, "vq_loss_layer_027": 0.000418, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.002197, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.302711, "epoch": 0.01856, "grad_norm": 0.0011721046175807714, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.103027, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.12207, "key_mse_loss_layer_015": 0.109863, "key_mse_loss_layer_016": 0.102051, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.109863, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049805, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050217, "step": 18560, "value_mse_loss_layer_000": 0.00037, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007202, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.011292, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054199, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000317, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.00033, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000194, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000261, "vq_loss_layer_023": 0.000286, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.000465, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.002716 }, { "ce_loss": 2.311806, "epoch": 0.01857, "grad_norm": 0.0011004701955243945, "key_mse_loss_layer_000": 0.003586, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.049402, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049792, "step": 18570, "value_mse_loss_layer_000": 0.00038, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.035156, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.053467, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.00029, "vq_loss_layer_026": 0.000458, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.000896, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.002823 }, { "ce_loss": 2.298083, "epoch": 0.01858, "grad_norm": 0.0011696695582941175, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049484, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.049881, "step": 18580, "value_mse_loss_layer_000": 0.00038, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.00383, "value_mse_loss_layer_003": 0.007141, "value_mse_loss_layer_004": 0.006622, "value_mse_loss_layer_005": 0.006165, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008423, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000172, "vq_loss_layer_012": 0.000294, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000313, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000134, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.000202, "vq_loss_layer_023": 0.000197, "vq_loss_layer_024": 0.000196, "vq_loss_layer_025": 0.000226, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000755, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.002396 }, { "ce_loss": 2.261858, "epoch": 0.01859, "grad_norm": 0.0011740182526409626, "key_mse_loss_layer_000": 0.003708, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.071289, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.082031, "key_mse_loss_layer_010": 0.094727, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.107422, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.099609, "key_mse_loss_layer_024": 0.083984, "key_mse_loss_layer_025": 0.07959, "key_mse_loss_layer_026": 0.088867, "key_mse_loss_layer_027": 0.09375, "key_mse_loss_layer_028": 0.097656, "key_mse_loss_layer_029": 0.095703, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049963, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050372, "step": 18590, "value_mse_loss_layer_000": 0.000374, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007935, "value_mse_loss_layer_004": 0.007446, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.016113, "value_mse_loss_layer_014": 0.01709, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015381, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.016968, "value_mse_loss_layer_019": 0.020142, "value_mse_loss_layer_020": 0.021362, "value_mse_loss_layer_021": 0.02417, "value_mse_loss_layer_022": 0.025513, "value_mse_loss_layer_023": 0.03125, "value_mse_loss_layer_024": 0.03833, "value_mse_loss_layer_025": 0.040283, "value_mse_loss_layer_026": 0.040771, "value_mse_loss_layer_027": 0.051514, "value_mse_loss_layer_028": 0.058594, "value_mse_loss_layer_029": 0.067871, "value_mse_loss_layer_030": 0.074219, "value_mse_loss_layer_031": 0.056885, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.1e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.2e-05, "vq_loss_layer_007": 0.000128, "vq_loss_layer_008": 0.000184, "vq_loss_layer_009": 0.00018, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.00029, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000336, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.00025, "vq_loss_layer_018": 0.000144, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000124, "vq_loss_layer_021": 0.00023, "vq_loss_layer_022": 0.000178, "vq_loss_layer_023": 0.000174, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000381, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.001083, "vq_loss_layer_029": 0.00119, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.004028 }, { "ce_loss": 2.291454, "epoch": 0.0186, "grad_norm": 0.0010554967448115349, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.059326, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.043457, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.126953, "key_mse_loss_layer_015": 0.11377, "key_mse_loss_layer_016": 0.108398, "key_mse_loss_layer_017": 0.106445, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.103516, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.102051, "key_mse_loss_layer_023": 0.101562, "key_mse_loss_layer_024": 0.080566, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.087891, "key_mse_loss_layer_027": 0.085449, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.049966, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050378, "step": 18600, "value_mse_loss_layer_000": 0.000374, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012207, "value_mse_loss_layer_012": 0.013367, "value_mse_loss_layer_013": 0.014587, "value_mse_loss_layer_014": 0.015259, "value_mse_loss_layer_015": 0.016357, "value_mse_loss_layer_016": 0.013184, "value_mse_loss_layer_017": 0.016968, "value_mse_loss_layer_018": 0.014709, "value_mse_loss_layer_019": 0.017578, "value_mse_loss_layer_020": 0.019409, "value_mse_loss_layer_021": 0.021362, "value_mse_loss_layer_022": 0.021118, "value_mse_loss_layer_023": 0.025024, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.031982, "value_mse_loss_layer_026": 0.029419, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.041504, "value_mse_loss_layer_029": 0.048828, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000202, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000345, "vq_loss_layer_017": 0.000261, "vq_loss_layer_018": 0.000163, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000214, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000345, "vq_loss_layer_025": 0.000383, "vq_loss_layer_026": 0.00046, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.001076, "vq_loss_layer_029": 0.000977, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.275678, "epoch": 0.01861, "grad_norm": 0.0012204383965581656, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.096191, "key_mse_loss_layer_010": 0.108398, "key_mse_loss_layer_011": 0.104492, "key_mse_loss_layer_012": 0.078125, "key_mse_loss_layer_013": 0.133789, "key_mse_loss_layer_014": 0.129883, "key_mse_loss_layer_015": 0.117188, "key_mse_loss_layer_016": 0.11084, "key_mse_loss_layer_017": 0.10791, "key_mse_loss_layer_018": 0.11377, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.10498, "key_mse_loss_layer_021": 0.098633, "key_mse_loss_layer_022": 0.104004, "key_mse_loss_layer_023": 0.099609, "key_mse_loss_layer_024": 0.080566, "key_mse_loss_layer_025": 0.07666, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.09082, "key_mse_loss_layer_028": 0.096191, "key_mse_loss_layer_029": 0.088379, "key_mse_loss_layer_030": 0.094238, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.05015, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050562, "step": 18610, "value_mse_loss_layer_000": 0.000378, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014587, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.016724, "value_mse_loss_layer_016": 0.013367, "value_mse_loss_layer_017": 0.016968, "value_mse_loss_layer_018": 0.014832, "value_mse_loss_layer_019": 0.016846, "value_mse_loss_layer_020": 0.018799, "value_mse_loss_layer_021": 0.020874, "value_mse_loss_layer_022": 0.02124, "value_mse_loss_layer_023": 0.022949, "value_mse_loss_layer_024": 0.026367, "value_mse_loss_layer_025": 0.030884, "value_mse_loss_layer_026": 0.027832, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.040771, "value_mse_loss_layer_029": 0.047363, "value_mse_loss_layer_030": 0.053223, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000115, "vq_loss_layer_007": 0.000166, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000376, "vq_loss_layer_015": 0.000338, "vq_loss_layer_016": 0.000338, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.000347, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000313, "vq_loss_layer_025": 0.000454, "vq_loss_layer_026": 0.000576, "vq_loss_layer_027": 0.000641, "vq_loss_layer_028": 0.000969, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002747, "vq_loss_layer_031": 0.003754 }, { "ce_loss": 2.274569, "epoch": 0.01862, "grad_norm": 0.0010804679477587342, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.062988, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.082031, "kv_mse_loss": 0.049655, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050052, "step": 18620, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.003891, "value_mse_loss_layer_003": 0.007172, "value_mse_loss_layer_004": 0.006592, "value_mse_loss_layer_005": 0.006012, "value_mse_loss_layer_006": 0.007935, "value_mse_loss_layer_007": 0.008301, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 2e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000171, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000161, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000273, "vq_loss_layer_022": 0.000182, "vq_loss_layer_023": 0.000223, "vq_loss_layer_024": 0.000197, "vq_loss_layer_025": 0.000222, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000553, "vq_loss_layer_029": 0.00079, "vq_loss_layer_030": 0.00161, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.304569, "epoch": 0.01863, "grad_norm": 0.0009842824656516314, "key_mse_loss_layer_000": 0.003601, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.047119, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049503, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.049887, "step": 18630, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014526, "value_mse_loss_layer_010": 0.011536, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013367, "value_mse_loss_layer_013": 0.014709, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.01709, "value_mse_loss_layer_016": 0.013855, "value_mse_loss_layer_017": 0.016968, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019409, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000141, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000261, "vq_loss_layer_018": 0.00019, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000165, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000362, "vq_loss_layer_026": 0.000471, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000801, "vq_loss_layer_029": 0.001045, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.302949, "epoch": 0.01864, "grad_norm": 0.0011274493299424648, "key_mse_loss_layer_000": 0.002777, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.064941, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.067871, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.094727, "key_mse_loss_layer_009": 0.098145, "key_mse_loss_layer_010": 0.110352, "key_mse_loss_layer_011": 0.106934, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.128906, "key_mse_loss_layer_014": 0.125, "key_mse_loss_layer_015": 0.110352, "key_mse_loss_layer_016": 0.106934, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.117676, "key_mse_loss_layer_019": 0.095215, "key_mse_loss_layer_020": 0.107422, "key_mse_loss_layer_021": 0.101562, "key_mse_loss_layer_022": 0.10791, "key_mse_loss_layer_023": 0.109375, "key_mse_loss_layer_024": 0.088867, "key_mse_loss_layer_025": 0.083496, "key_mse_loss_layer_026": 0.100098, "key_mse_loss_layer_027": 0.100586, "key_mse_loss_layer_028": 0.10791, "key_mse_loss_layer_029": 0.093262, "key_mse_loss_layer_030": 0.10791, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.049997, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050412, "step": 18640, "value_mse_loss_layer_000": 0.000319, "value_mse_loss_layer_001": 0.000942, "value_mse_loss_layer_002": 0.003769, "value_mse_loss_layer_003": 0.007172, "value_mse_loss_layer_004": 0.006622, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.007721, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010193, "value_mse_loss_layer_009": 0.013428, "value_mse_loss_layer_010": 0.01123, "value_mse_loss_layer_011": 0.012085, "value_mse_loss_layer_012": 0.012695, "value_mse_loss_layer_013": 0.013611, "value_mse_loss_layer_014": 0.014343, "value_mse_loss_layer_015": 0.014404, "value_mse_loss_layer_016": 0.012085, "value_mse_loss_layer_017": 0.014526, "value_mse_loss_layer_018": 0.014526, "value_mse_loss_layer_019": 0.015991, "value_mse_loss_layer_020": 0.017456, "value_mse_loss_layer_021": 0.019409, "value_mse_loss_layer_022": 0.019775, "value_mse_loss_layer_023": 0.022827, "value_mse_loss_layer_024": 0.026733, "value_mse_loss_layer_025": 0.030762, "value_mse_loss_layer_026": 0.027466, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.040771, "value_mse_loss_layer_029": 0.046631, "value_mse_loss_layer_030": 0.053711, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 2.3e-05, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 7.2e-05, "vq_loss_layer_006": 8.9e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000193, "vq_loss_layer_009": 0.000182, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000284, "vq_loss_layer_013": 0.000224, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000296, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000197, "vq_loss_layer_018": 0.000166, "vq_loss_layer_019": 0.000133, "vq_loss_layer_020": 0.000168, "vq_loss_layer_021": 0.000246, "vq_loss_layer_022": 0.000257, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000401, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.000561, "vq_loss_layer_028": 0.001373, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.002274, "vq_loss_layer_031": 0.004364 }, { "ce_loss": 2.308931, "epoch": 0.01865, "grad_norm": 0.0009932543616741896, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057129, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.090332, "key_mse_loss_layer_009": 0.094727, "key_mse_loss_layer_010": 0.10791, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.12793, "key_mse_loss_layer_015": 0.117188, "key_mse_loss_layer_016": 0.111816, "key_mse_loss_layer_017": 0.111816, "key_mse_loss_layer_018": 0.117676, "key_mse_loss_layer_019": 0.098145, "key_mse_loss_layer_020": 0.11084, "key_mse_loss_layer_021": 0.106445, "key_mse_loss_layer_022": 0.110352, "key_mse_loss_layer_023": 0.10498, "key_mse_loss_layer_024": 0.085449, "key_mse_loss_layer_025": 0.081543, "key_mse_loss_layer_026": 0.094727, "key_mse_loss_layer_027": 0.09375, "key_mse_loss_layer_028": 0.101074, "key_mse_loss_layer_029": 0.091797, "key_mse_loss_layer_030": 0.099121, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.049911, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.050296, "step": 18650, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.013733, "value_mse_loss_layer_017": 0.017212, "value_mse_loss_layer_018": 0.015015, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.02478, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.029297, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000165, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000231, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000347, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000171, "vq_loss_layer_019": 0.000194, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.000374, "vq_loss_layer_022": 0.000282, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000294, "vq_loss_layer_025": 0.00037, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.00082, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.310356, "epoch": 0.01866, "grad_norm": 0.0012558716116473079, "key_mse_loss_layer_000": 0.003571, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.083008, "key_mse_loss_layer_010": 0.095215, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.106445, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.098145, "key_mse_loss_layer_020": 0.107422, "key_mse_loss_layer_021": 0.101074, "key_mse_loss_layer_022": 0.103516, "key_mse_loss_layer_023": 0.10791, "key_mse_loss_layer_024": 0.092285, "key_mse_loss_layer_025": 0.088867, "key_mse_loss_layer_026": 0.097656, "key_mse_loss_layer_027": 0.10498, "key_mse_loss_layer_028": 0.108398, "key_mse_loss_layer_029": 0.112793, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.07959, "kv_mse_loss": 0.049777, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050171, "step": 18660, "value_mse_loss_layer_000": 0.000378, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.011536, "value_mse_loss_layer_009": 0.015869, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014648, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.017334, "value_mse_loss_layer_015": 0.019409, "value_mse_loss_layer_016": 0.016357, "value_mse_loss_layer_017": 0.02063, "value_mse_loss_layer_018": 0.017456, "value_mse_loss_layer_019": 0.022217, "value_mse_loss_layer_020": 0.024536, "value_mse_loss_layer_021": 0.026489, "value_mse_loss_layer_022": 0.029297, "value_mse_loss_layer_023": 0.033203, "value_mse_loss_layer_024": 0.044434, "value_mse_loss_layer_025": 0.04541, "value_mse_loss_layer_026": 0.047607, "value_mse_loss_layer_027": 0.05835, "value_mse_loss_layer_028": 0.069336, "value_mse_loss_layer_029": 0.080078, "value_mse_loss_layer_030": 0.083496, "value_mse_loss_layer_031": 0.0625, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000137, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000175, "vq_loss_layer_010": 0.000143, "vq_loss_layer_011": 0.000162, "vq_loss_layer_012": 0.000273, "vq_loss_layer_013": 0.000231, "vq_loss_layer_014": 0.000259, "vq_loss_layer_015": 0.000288, "vq_loss_layer_016": 0.000227, "vq_loss_layer_017": 0.000185, "vq_loss_layer_018": 0.000114, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000114, "vq_loss_layer_021": 0.000203, "vq_loss_layer_022": 0.000168, "vq_loss_layer_023": 0.00018, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000359, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.001076, "vq_loss_layer_029": 0.001175, "vq_loss_layer_030": 0.001701, "vq_loss_layer_031": 0.003067 }, { "ce_loss": 2.318879, "epoch": 0.01867, "grad_norm": 0.001069416874088347, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049628, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.050012, "step": 18670, "value_mse_loss_layer_000": 0.000366, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000194, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000402, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000452, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000725, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.00177, "vq_loss_layer_031": 0.00293 }, { "ce_loss": 2.340512, "epoch": 0.01868, "grad_norm": 0.0010809408267959952, "key_mse_loss_layer_000": 0.003494, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049847, "kv_vq_loss": 0.000382, "learning_rate": 0.001, "loss": 0.05025, "step": 18680, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007385, "value_mse_loss_layer_005": 0.006866, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009277, "value_mse_loss_layer_008": 0.01123, "value_mse_loss_layer_009": 0.015503, "value_mse_loss_layer_010": 0.012634, "value_mse_loss_layer_011": 0.013245, "value_mse_loss_layer_012": 0.014404, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000124, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.00017, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000204, "vq_loss_layer_011": 0.000217, "vq_loss_layer_012": 0.000336, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000416, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000195, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000221, "vq_loss_layer_021": 0.000399, "vq_loss_layer_022": 0.000288, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000307, "vq_loss_layer_025": 0.000381, "vq_loss_layer_026": 0.000549, "vq_loss_layer_027": 0.000668, "vq_loss_layer_028": 0.00082, "vq_loss_layer_029": 0.001007, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.003342 }, { "ce_loss": 2.307269, "epoch": 0.01869, "grad_norm": 0.0010634125210344791, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.05007, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.050452, "step": 18690, "value_mse_loss_layer_000": 0.00038, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.053223, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.00037, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000483, "vq_loss_layer_027": 0.000553, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.000999, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.313327, "epoch": 0.0187, "grad_norm": 0.001473759301006794, "key_mse_loss_layer_000": 0.002731, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.092773, "key_mse_loss_layer_009": 0.098633, "key_mse_loss_layer_010": 0.114258, "key_mse_loss_layer_011": 0.109863, "key_mse_loss_layer_012": 0.081543, "key_mse_loss_layer_013": 0.145508, "key_mse_loss_layer_014": 0.142578, "key_mse_loss_layer_015": 0.130859, "key_mse_loss_layer_016": 0.12793, "key_mse_loss_layer_017": 0.125977, "key_mse_loss_layer_018": 0.131836, "key_mse_loss_layer_019": 0.107422, "key_mse_loss_layer_020": 0.123535, "key_mse_loss_layer_021": 0.115723, "key_mse_loss_layer_022": 0.122559, "key_mse_loss_layer_023": 0.121582, "key_mse_loss_layer_024": 0.095215, "key_mse_loss_layer_025": 0.088867, "key_mse_loss_layer_026": 0.106445, "key_mse_loss_layer_027": 0.100586, "key_mse_loss_layer_028": 0.110352, "key_mse_loss_layer_029": 0.09668, "key_mse_loss_layer_030": 0.121582, "key_mse_loss_layer_031": 0.082031, "kv_mse_loss": 0.049976, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.0504, "step": 18700, "value_mse_loss_layer_000": 0.000343, "value_mse_loss_layer_001": 0.000977, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007874, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.008972, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.016602, "value_mse_loss_layer_016": 0.013306, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.024902, "value_mse_loss_layer_024": 0.027588, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.04126, "value_mse_loss_layer_029": 0.04834, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000127, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000218, "vq_loss_layer_010": 0.000201, "vq_loss_layer_011": 0.000216, "vq_loss_layer_012": 0.000362, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000237, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000236, "vq_loss_layer_025": 0.000347, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000469, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000755, "vq_loss_layer_030": 0.002869, "vq_loss_layer_031": 0.003845 }, { "ce_loss": 2.302676, "epoch": 0.01871, "grad_norm": 0.001122549525462091, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049933, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.050339, "step": 18710, "value_mse_loss_layer_000": 0.000385, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.021729, "value_mse_loss_layer_022": 0.022339, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000191, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000178, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000322, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000299, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000862, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.313249, "epoch": 0.01872, "grad_norm": 0.0010432421695441008, "key_mse_loss_layer_000": 0.003448, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049747, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050134, "step": 18720, "value_mse_loss_layer_000": 0.000374, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.2e-05, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000254, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000173, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000576, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.000931, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.335036, "epoch": 0.01873, "grad_norm": 0.0012282145908102393, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049548, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.049963, "step": 18730, "value_mse_loss_layer_000": 0.000378, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015198, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019165, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.022339, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.027344, "value_mse_loss_layer_025": 0.032227, "value_mse_loss_layer_026": 0.028687, "value_mse_loss_layer_027": 0.036865, "value_mse_loss_layer_028": 0.04126, "value_mse_loss_layer_029": 0.048584, "value_mse_loss_layer_030": 0.053223, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000184, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000163, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000324, "vq_loss_layer_022": 0.000229, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000294, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000416, "vq_loss_layer_028": 0.000668, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.001663, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.354015, "epoch": 0.01874, "grad_norm": 0.0010448412504047155, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.09668, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.050204, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.050598, "step": 18740, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.013855, "value_mse_loss_layer_017": 0.017212, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.025024, "value_mse_loss_layer_024": 0.02771, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.048828, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.045654, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000319, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000286, "vq_loss_layer_017": 0.000252, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000223, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000435, "vq_loss_layer_027": 0.000444, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000698, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.002441 }, { "ce_loss": 2.252552, "epoch": 0.01875, "grad_norm": 0.0012029330246150494, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.04989, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050299, "step": 18750, "value_mse_loss_layer_000": 0.000374, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000134, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000134, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000186, "vq_loss_layer_023": 0.000232, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.000244, "vq_loss_layer_026": 0.000395, "vq_loss_layer_027": 0.000435, "vq_loss_layer_028": 0.000599, "vq_loss_layer_029": 0.00079, "vq_loss_layer_030": 0.002426, "vq_loss_layer_031": 0.002411 }, { "ce_loss": 2.339571, "epoch": 0.01876, "grad_norm": 0.0011215454433113337, "key_mse_loss_layer_000": 0.003067, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.095215, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.086426, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049789, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050186, "step": 18760, "value_mse_loss_layer_000": 0.00038, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.053711, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000136, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000191, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000292, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.00038, "vq_loss_layer_022": 0.000218, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000477, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.000797, "vq_loss_layer_030": 0.001625, "vq_loss_layer_031": 0.002609 }, { "ce_loss": 2.333579, "epoch": 0.01877, "grad_norm": 0.0012454614043235779, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.056885, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049579, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.049954, "step": 18770, "value_mse_loss_layer_000": 0.00037, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007202, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000134, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000408, "vq_loss_layer_016": 0.000284, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000191, "vq_loss_layer_024": 0.000187, "vq_loss_layer_025": 0.000257, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.002533 }, { "ce_loss": 2.293019, "epoch": 0.01878, "grad_norm": 0.001127680647186935, "key_mse_loss_layer_000": 0.003937, "key_mse_loss_layer_001": 0.011292, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.075195, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.086426, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.091797, "key_mse_loss_layer_030": 0.099609, "key_mse_loss_layer_031": 0.087891, "kv_mse_loss": 0.049802, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.05022, "step": 18780, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.019409, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.031128, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.033447, "value_mse_loss_layer_027": 0.041992, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.0625, "value_mse_loss_layer_031": 0.049316, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000222, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000198, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.000332, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.000184, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000286, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000288, "vq_loss_layer_024": 0.000359, "vq_loss_layer_025": 0.000381, "vq_loss_layer_026": 0.000607, "vq_loss_layer_027": 0.00095, "vq_loss_layer_028": 0.001274, "vq_loss_layer_029": 0.002014, "vq_loss_layer_030": 0.003586, "vq_loss_layer_031": 0.004974 }, { "ce_loss": 2.299172, "epoch": 0.01879, "grad_norm": 0.0013259418774396181, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049646, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050037, "step": 18790, "value_mse_loss_layer_000": 0.000383, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.00018, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000359, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000169, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000353, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000278, "vq_loss_layer_024": 0.000265, "vq_loss_layer_025": 0.000343, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000607, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000851, "vq_loss_layer_030": 0.001938, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.320617, "epoch": 0.0188, "grad_norm": 0.0010268631158396602, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069336, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.04938, "kv_vq_loss": 0.000381, "learning_rate": 0.001, "loss": 0.049765, "step": 18800, "value_mse_loss_layer_000": 0.000378, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.030396, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5.4e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000294, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000595, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.001869, "vq_loss_layer_031": 0.002563 }, { "ce_loss": 2.286643, "epoch": 0.01881, "grad_norm": 0.0010476476745679975, "key_mse_loss_layer_000": 0.003632, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049847, "kv_vq_loss": 0.000411, "learning_rate": 0.001, "loss": 0.050275, "step": 18810, "value_mse_loss_layer_000": 0.00038, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006287, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000248, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000157, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.000225, "vq_loss_layer_024": 0.000221, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000732, "vq_loss_layer_030": 0.001671, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.310815, "epoch": 0.01882, "grad_norm": 0.0009730707970447838, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049554, "kv_vq_loss": 0.000381, "learning_rate": 0.001, "loss": 0.049915, "step": 18820, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010498, "value_mse_loss_layer_009": 0.014465, "value_mse_loss_layer_010": 0.011536, "value_mse_loss_layer_011": 0.012146, "value_mse_loss_layer_012": 0.013367, "value_mse_loss_layer_013": 0.014587, "value_mse_loss_layer_014": 0.015259, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.029907, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.053711, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.1e-05, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000138, "vq_loss_layer_009": 0.000179, "vq_loss_layer_010": 0.00015, "vq_loss_layer_011": 0.000171, "vq_loss_layer_012": 0.000294, "vq_loss_layer_013": 0.000248, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000338, "vq_loss_layer_016": 0.00028, "vq_loss_layer_017": 0.000261, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000198, "vq_loss_layer_023": 0.000221, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000267, "vq_loss_layer_026": 0.000381, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000778, "vq_loss_layer_030": 0.00164, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.295209, "epoch": 0.01883, "grad_norm": 0.0012171394191682339, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.079102, "kv_mse_loss": 0.049805, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050204, "step": 18830, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000202, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000242, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000353, "vq_loss_layer_026": 0.000519, "vq_loss_layer_027": 0.000626, "vq_loss_layer_028": 0.001122, "vq_loss_layer_029": 0.001862, "vq_loss_layer_030": 0.002884, "vq_loss_layer_031": 0.003967 }, { "ce_loss": 2.312052, "epoch": 0.01884, "grad_norm": 0.001201295992359519, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010559, "key_mse_loss_layer_002": 0.0625, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.063965, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.09375, "key_mse_loss_layer_009": 0.099121, "key_mse_loss_layer_010": 0.112305, "key_mse_loss_layer_011": 0.108398, "key_mse_loss_layer_012": 0.081543, "key_mse_loss_layer_013": 0.143555, "key_mse_loss_layer_014": 0.137695, "key_mse_loss_layer_015": 0.124023, "key_mse_loss_layer_016": 0.125, "key_mse_loss_layer_017": 0.119141, "key_mse_loss_layer_018": 0.131836, "key_mse_loss_layer_019": 0.103027, "key_mse_loss_layer_020": 0.117188, "key_mse_loss_layer_021": 0.111816, "key_mse_loss_layer_022": 0.121094, "key_mse_loss_layer_023": 0.12207, "key_mse_loss_layer_024": 0.102051, "key_mse_loss_layer_025": 0.091309, "key_mse_loss_layer_026": 0.111328, "key_mse_loss_layer_027": 0.110352, "key_mse_loss_layer_028": 0.114746, "key_mse_loss_layer_029": 0.101074, "key_mse_loss_layer_030": 0.119141, "key_mse_loss_layer_031": 0.078613, "kv_mse_loss": 0.049655, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.050037, "step": 18840, "value_mse_loss_layer_000": 0.000351, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.013733, "value_mse_loss_layer_010": 0.011475, "value_mse_loss_layer_011": 0.012268, "value_mse_loss_layer_012": 0.013184, "value_mse_loss_layer_013": 0.014221, "value_mse_loss_layer_014": 0.015015, "value_mse_loss_layer_015": 0.015503, "value_mse_loss_layer_016": 0.012939, "value_mse_loss_layer_017": 0.015747, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.01709, "value_mse_loss_layer_020": 0.019043, "value_mse_loss_layer_021": 0.02124, "value_mse_loss_layer_022": 0.021118, "value_mse_loss_layer_023": 0.023926, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.05835, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000209, "vq_loss_layer_010": 0.000202, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000324, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000374, "vq_loss_layer_015": 0.000311, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000236, "vq_loss_layer_018": 0.000203, "vq_loss_layer_019": 0.000187, "vq_loss_layer_020": 0.000204, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000307, "vq_loss_layer_023": 0.00029, "vq_loss_layer_024": 0.000429, "vq_loss_layer_025": 0.000526, "vq_loss_layer_026": 0.000591, "vq_loss_layer_027": 0.000797, "vq_loss_layer_028": 0.001259, "vq_loss_layer_029": 0.001114, "vq_loss_layer_030": 0.002731, "vq_loss_layer_031": 0.004333 }, { "ce_loss": 2.317797, "epoch": 0.01885, "grad_norm": 0.0010603036498650908, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.116211, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.050146, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050555, "step": 18850, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.00103, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012268, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022217, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000334, "vq_loss_layer_016": 0.000326, "vq_loss_layer_017": 0.000259, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000305, "vq_loss_layer_026": 0.000496, "vq_loss_layer_027": 0.000599, "vq_loss_layer_028": 0.000813, "vq_loss_layer_029": 0.000965, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.003143 }, { "ce_loss": 2.332084, "epoch": 0.01886, "grad_norm": 0.0011135510867461562, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.095215, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049268, "kv_vq_loss": 0.000374, "learning_rate": 0.001, "loss": 0.049637, "step": 18860, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007111, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.014648, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015198, "value_mse_loss_layer_019": 0.0177, "value_mse_loss_layer_020": 0.019409, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.025024, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000161, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000181, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000209, "vq_loss_layer_023": 0.000219, "vq_loss_layer_024": 0.000205, "vq_loss_layer_025": 0.000257, "vq_loss_layer_026": 0.000387, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000725, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.002625 }, { "ce_loss": 2.324024, "epoch": 0.01887, "grad_norm": 0.0010743364691734314, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.053467, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049924, "kv_vq_loss": 0.000381, "learning_rate": 0.001, "loss": 0.050299, "step": 18870, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.016113, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030151, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000173, "vq_loss_layer_008": 0.000137, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000157, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000186, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000267, "vq_loss_layer_022": 0.000193, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000217, "vq_loss_layer_025": 0.000237, "vq_loss_layer_026": 0.000366, "vq_loss_layer_027": 0.000422, "vq_loss_layer_028": 0.000603, "vq_loss_layer_029": 0.000896, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.256018, "epoch": 0.01888, "grad_norm": 0.0010463980725035071, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049799, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050198, "step": 18880, "value_mse_loss_layer_000": 0.000368, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.006622, "value_mse_loss_layer_005": 0.006287, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.014587, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015076, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 3.8e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.0002, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.00025, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000458, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.001602, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.334328, "epoch": 0.01889, "grad_norm": 0.0010569066507741809, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.099121, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094727, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.04971, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.050092, "step": 18890, "value_mse_loss_layer_000": 0.000368, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015259, "value_mse_loss_layer_019": 0.017334, "value_mse_loss_layer_020": 0.019409, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.022095, "value_mse_loss_layer_023": 0.02417, "value_mse_loss_layer_024": 0.026733, "value_mse_loss_layer_025": 0.032227, "value_mse_loss_layer_026": 0.028198, "value_mse_loss_layer_027": 0.036621, "value_mse_loss_layer_028": 0.040771, "value_mse_loss_layer_029": 0.047119, "value_mse_loss_layer_030": 0.051514, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000135, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000136, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000246, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000439, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000755, "vq_loss_layer_030": 0.001923, "vq_loss_layer_031": 0.002701 }, { "ce_loss": 2.316019, "epoch": 0.0189, "grad_norm": 0.0011984590673819184, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.04967, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.050061, "step": 18900, "value_mse_loss_layer_000": 0.000372, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014221, "value_mse_loss_layer_010": 0.011597, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013184, "value_mse_loss_layer_013": 0.014404, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.01709, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.032715, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.042725, "value_mse_loss_layer_028": 0.047119, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.060547, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000171, "vq_loss_layer_009": 0.000178, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000286, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000332, "vq_loss_layer_016": 0.000322, "vq_loss_layer_017": 0.000241, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000159, "vq_loss_layer_021": 0.000256, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000171, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000256, "vq_loss_layer_026": 0.000357, "vq_loss_layer_027": 0.000452, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000908, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.322532, "epoch": 0.01891, "grad_norm": 0.0011041450779885054, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049261, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.049652, "step": 18910, "value_mse_loss_layer_000": 0.000368, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000307, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000469, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.0009, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.291276, "epoch": 0.01892, "grad_norm": 0.001193226664327085, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.049622, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050037, "step": 18920, "value_mse_loss_layer_000": 0.000381, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000144, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000257, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000414, "vq_loss_layer_016": 0.000362, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000225, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.00023, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000412, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000881, "vq_loss_layer_029": 0.000816, "vq_loss_layer_030": 0.001617, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.296878, "epoch": 0.01893, "grad_norm": 0.0010074819438159466, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.053223, "key_mse_loss_layer_004": 0.059326, "key_mse_loss_layer_005": 0.063477, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.101074, "key_mse_loss_layer_016": 0.09375, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.049753, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.05014, "step": 18930, "value_mse_loss_layer_000": 0.000368, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007202, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.00014, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000187, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000209, "vq_loss_layer_024": 0.000211, "vq_loss_layer_025": 0.000248, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.000591, "vq_loss_layer_029": 0.000893, "vq_loss_layer_030": 0.001862, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.336608, "epoch": 0.01894, "grad_norm": 0.0012650762218981981, "key_mse_loss_layer_000": 0.003601, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.057861, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.094238, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.083008, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.059326, "kv_mse_loss": 0.04968, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050089, "step": 18940, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.001038, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.007294, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022217, "value_mse_loss_layer_023": 0.025146, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000141, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000199, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.000303, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000368, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.00036, "vq_loss_layer_022": 0.00024, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000353, "vq_loss_layer_025": 0.000357, "vq_loss_layer_026": 0.000467, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.000912, "vq_loss_layer_029": 0.000885, "vq_loss_layer_030": 0.002747, "vq_loss_layer_031": 0.003693 }, { "ce_loss": 2.332373, "epoch": 0.01895, "grad_norm": 0.0010289497440680861, "key_mse_loss_layer_000": 0.00296, "key_mse_loss_layer_001": 0.009766, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.080078, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.050125, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050531, "step": 18950, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 3.9e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.2e-05, "vq_loss_layer_007": 0.000133, "vq_loss_layer_008": 0.000138, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000173, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000217, "vq_loss_layer_024": 0.000204, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000671, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.00177, "vq_loss_layer_031": 0.00267 }, { "ce_loss": 2.335032, "epoch": 0.01896, "grad_norm": 0.0011038483353331685, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.122559, "key_mse_loss_layer_014": 0.120117, "key_mse_loss_layer_015": 0.108887, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.104004, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.091309, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.049564, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.049945, "step": 18960, "value_mse_loss_layer_000": 0.000364, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.003815, "value_mse_loss_layer_003": 0.00708, "value_mse_loss_layer_004": 0.006592, "value_mse_loss_layer_005": 0.006073, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008423, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.02478, "value_mse_loss_layer_024": 0.026978, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.028687, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.041992, "value_mse_loss_layer_029": 0.048096, "value_mse_loss_layer_030": 0.05249, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 4.8e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.00033, "vq_loss_layer_016": 0.00029, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000236, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000221, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000809, "vq_loss_layer_030": 0.00174, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.33039, "epoch": 0.01897, "grad_norm": 0.0011049810564145446, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.053955, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049878, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.050266, "step": 18970, "value_mse_loss_layer_000": 0.000374, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.049805, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000137, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.000134, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.000223, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.000717, "vq_loss_layer_029": 0.000782, "vq_loss_layer_030": 0.001717, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.303538, "epoch": 0.01898, "grad_norm": 0.0011579337297007442, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.08252, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049414, "kv_vq_loss": 0.000382, "learning_rate": 0.001, "loss": 0.049799, "step": 18980, "value_mse_loss_layer_000": 0.000374, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.01416, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000132, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000197, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000378, "vq_loss_layer_015": 0.000359, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000305, "vq_loss_layer_018": 0.000216, "vq_loss_layer_019": 0.000181, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.00034, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000273, "vq_loss_layer_025": 0.000385, "vq_loss_layer_026": 0.000504, "vq_loss_layer_027": 0.000565, "vq_loss_layer_028": 0.001045, "vq_loss_layer_029": 0.001015, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003586 }, { "ce_loss": 2.349848, "epoch": 0.01899, "grad_norm": 0.0009914280381053686, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.082031, "key_mse_loss_layer_020": 0.089844, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.086914, "key_mse_loss_layer_023": 0.083984, "key_mse_loss_layer_024": 0.067871, "key_mse_loss_layer_025": 0.064941, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.07959, "key_mse_loss_layer_029": 0.07666, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.049582, "kv_vq_loss": 0.000379, "learning_rate": 0.001, "loss": 0.049942, "step": 18990, "value_mse_loss_layer_000": 0.000372, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.042236, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000219, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000228, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000736, "vq_loss_layer_030": 0.001701, "vq_loss_layer_031": 0.002853 }, { "ce_loss": 2.318508, "epoch": 0.019, "grad_norm": 0.0011516636004671454, "key_mse_loss_layer_000": 0.003845, "key_mse_loss_layer_001": 0.011597, "key_mse_loss_layer_002": 0.061523, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.064453, "key_mse_loss_layer_006": 0.070801, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.087891, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.089355, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.049524, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.049893, "step": 19000, "value_mse_loss_layer_000": 0.000368, "value_mse_loss_layer_001": 0.001022, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007751, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006653, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.01416, "value_mse_loss_layer_010": 0.011597, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.014465, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.01709, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.016235, "value_mse_loss_layer_018": 0.015259, "value_mse_loss_layer_019": 0.017578, "value_mse_loss_layer_020": 0.018921, "value_mse_loss_layer_021": 0.021362, "value_mse_loss_layer_022": 0.022095, "value_mse_loss_layer_023": 0.024414, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.049072, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 2.6e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000277, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000241, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000173, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.00033, "vq_loss_layer_025": 0.00038, "vq_loss_layer_026": 0.000488, "vq_loss_layer_027": 0.000637, "vq_loss_layer_028": 0.000896, "vq_loss_layer_029": 0.001129, "vq_loss_layer_030": 0.002991, "vq_loss_layer_031": 0.004181 }, { "ce_loss": 2.309083, "epoch": 0.01901, "grad_norm": 0.0010999912628903985, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.042969, "key_mse_loss_layer_005": 0.054932, "key_mse_loss_layer_006": 0.062256, "key_mse_loss_layer_007": 0.071777, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.089355, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.082031, "key_mse_loss_layer_020": 0.089355, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.087402, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.074219, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.075684, "key_mse_loss_layer_031": 0.061279, "kv_mse_loss": 0.049869, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.050259, "step": 19010, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.007996, "value_mse_loss_layer_007": 0.008423, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.014526, "value_mse_loss_layer_010": 0.011719, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.014709, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014038, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019287, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.025391, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.1e-05, "vq_loss_layer_005": 4.8e-05, "vq_loss_layer_006": 8.2e-05, "vq_loss_layer_007": 0.000134, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000183, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000248, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.00016, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000238, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000721, "vq_loss_layer_029": 0.000847, "vq_loss_layer_030": 0.001572, "vq_loss_layer_031": 0.003006 }, { "ce_loss": 2.337661, "epoch": 0.01902, "grad_norm": 0.0010670736664906144, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049176, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.049564, "step": 19020, "value_mse_loss_layer_000": 0.000368, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007263, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006195, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000136, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.000172, "vq_loss_layer_012": 0.000292, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000317, "vq_loss_layer_015": 0.000336, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000263, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.000225, "vq_loss_layer_024": 0.000198, "vq_loss_layer_025": 0.000216, "vq_loss_layer_026": 0.000376, "vq_loss_layer_027": 0.000385, "vq_loss_layer_028": 0.000587, "vq_loss_layer_029": 0.000744, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.002365 }, { "ce_loss": 2.30658, "epoch": 0.01903, "grad_norm": 0.0011723593343049288, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.048096, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084473, "key_mse_loss_layer_024": 0.066406, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.080078, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.049435, "kv_vq_loss": 0.000381, "learning_rate": 0.001, "loss": 0.049823, "step": 19030, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.003845, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015198, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.025146, "value_mse_loss_layer_024": 0.027344, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.029175, "value_mse_loss_layer_027": 0.036865, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.048584, "value_mse_loss_layer_030": 0.05249, "value_mse_loss_layer_031": 0.045654, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000191, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000141, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000242, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000227, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.00042, "vq_loss_layer_028": 0.000645, "vq_loss_layer_029": 0.000721, "vq_loss_layer_030": 0.001625, "vq_loss_layer_031": 0.00267 }, { "ce_loss": 2.337415, "epoch": 0.01904, "grad_norm": 0.0011542383581399918, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.058105, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.072754, "kv_mse_loss": 0.049728, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050113, "step": 19040, "value_mse_loss_layer_000": 0.000366, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007202, "value_mse_loss_layer_004": 0.006744, "value_mse_loss_layer_005": 0.006195, "value_mse_loss_layer_006": 0.007996, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000181, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000269, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000235, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.000637, "vq_loss_layer_029": 0.000843, "vq_loss_layer_030": 0.002029, "vq_loss_layer_031": 0.002686 }, { "ce_loss": 2.303993, "epoch": 0.01905, "grad_norm": 0.0010787242790684104, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.045654, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.057373, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.049399, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049786, "step": 19050, "value_mse_loss_layer_000": 0.000372, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.003891, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010437, "value_mse_loss_layer_009": 0.014343, "value_mse_loss_layer_010": 0.011597, "value_mse_loss_layer_011": 0.012146, "value_mse_loss_layer_012": 0.013245, "value_mse_loss_layer_013": 0.014771, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.0177, "value_mse_loss_layer_020": 0.019409, "value_mse_loss_layer_021": 0.021729, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.032227, "value_mse_loss_layer_026": 0.028687, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.042236, "value_mse_loss_layer_029": 0.04834, "value_mse_loss_layer_030": 0.053711, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000118, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000179, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000188, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000713, "vq_loss_layer_029": 0.001152, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.002991 }, { "ce_loss": 2.309497, "epoch": 0.01906, "grad_norm": 0.0009903113823384047, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.124512, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.105469, "key_mse_loss_layer_017": 0.105957, "key_mse_loss_layer_018": 0.112793, "key_mse_loss_layer_019": 0.093262, "key_mse_loss_layer_020": 0.105469, "key_mse_loss_layer_021": 0.100098, "key_mse_loss_layer_022": 0.104492, "key_mse_loss_layer_023": 0.100586, "key_mse_loss_layer_024": 0.080078, "key_mse_loss_layer_025": 0.076172, "key_mse_loss_layer_026": 0.089355, "key_mse_loss_layer_027": 0.087402, "key_mse_loss_layer_028": 0.095215, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.095703, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.0496, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.049991, "step": 19060, "value_mse_loss_layer_000": 0.000366, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014465, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013245, "value_mse_loss_layer_013": 0.014648, "value_mse_loss_layer_014": 0.01532, "value_mse_loss_layer_015": 0.016724, "value_mse_loss_layer_016": 0.01355, "value_mse_loss_layer_017": 0.01709, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.017578, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.024902, "value_mse_loss_layer_024": 0.027588, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.029297, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.049072, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 5.3e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000243, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000309, "vq_loss_layer_016": 0.000278, "vq_loss_layer_017": 0.000256, "vq_loss_layer_018": 0.000166, "vq_loss_layer_019": 0.000144, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000239, "vq_loss_layer_024": 0.000209, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000454, "vq_loss_layer_028": 0.000774, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.001778, "vq_loss_layer_031": 0.002625 }, { "ce_loss": 2.30692, "epoch": 0.01907, "grad_norm": 0.0011762833455577493, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049677, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050052, "step": 19070, "value_mse_loss_layer_000": 0.000366, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003891, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019409, "value_mse_loss_layer_021": 0.021729, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.032471, "value_mse_loss_layer_026": 0.028442, "value_mse_loss_layer_027": 0.036621, "value_mse_loss_layer_028": 0.041748, "value_mse_loss_layer_029": 0.048584, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.4e-05, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000322, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000265, "vq_loss_layer_018": 0.00016, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000189, "vq_loss_layer_023": 0.00025, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000376, "vq_loss_layer_027": 0.000418, "vq_loss_layer_028": 0.000916, "vq_loss_layer_029": 0.000839, "vq_loss_layer_030": 0.001579, "vq_loss_layer_031": 0.002747 }, { "ce_loss": 2.288035, "epoch": 0.01908, "grad_norm": 0.0010985835688188672, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086914, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.049576, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.049969, "step": 19080, "value_mse_loss_layer_000": 0.000372, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008362, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019287, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.022217, "value_mse_loss_layer_023": 0.024902, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.049072, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 4.8e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000135, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000248, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000156, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000214, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000435, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.000736, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.002945 }, { "ce_loss": 2.304002, "epoch": 0.01909, "grad_norm": 0.0012053196551278234, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.102539, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.049408, "kv_vq_loss": 0.000375, "learning_rate": 0.001, "loss": 0.049762, "step": 19090, "value_mse_loss_layer_000": 0.000353, "value_mse_loss_layer_001": 0.000977, "value_mse_loss_layer_002": 0.003815, "value_mse_loss_layer_003": 0.007202, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.007996, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000184, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000211, "vq_loss_layer_024": 0.000244, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.000471, "vq_loss_layer_028": 0.000652, "vq_loss_layer_029": 0.000793, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.344017, "epoch": 0.0191, "grad_norm": 0.001038050395436585, "key_mse_loss_layer_000": 0.002182, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.043457, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.094727, "key_mse_loss_layer_010": 0.11084, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.07959, "key_mse_loss_layer_013": 0.143555, "key_mse_loss_layer_014": 0.136719, "key_mse_loss_layer_015": 0.124512, "key_mse_loss_layer_016": 0.117188, "key_mse_loss_layer_017": 0.117676, "key_mse_loss_layer_018": 0.121582, "key_mse_loss_layer_019": 0.099609, "key_mse_loss_layer_020": 0.117188, "key_mse_loss_layer_021": 0.108398, "key_mse_loss_layer_022": 0.111816, "key_mse_loss_layer_023": 0.109375, "key_mse_loss_layer_024": 0.083496, "key_mse_loss_layer_025": 0.080078, "key_mse_loss_layer_026": 0.093262, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.097168, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.049597, "kv_vq_loss": 0.000372, "learning_rate": 0.001, "loss": 0.049945, "step": 19100, "value_mse_loss_layer_000": 0.000326, "value_mse_loss_layer_001": 0.000954, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.007477, "value_mse_loss_layer_005": 0.006714, "value_mse_loss_layer_006": 0.008789, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.015442, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.015564, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.013428, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.014465, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.020996, "value_mse_loss_layer_022": 0.019653, "value_mse_loss_layer_023": 0.022949, "value_mse_loss_layer_024": 0.025024, "value_mse_loss_layer_025": 0.031982, "value_mse_loss_layer_026": 0.027222, "value_mse_loss_layer_027": 0.033203, "value_mse_loss_layer_028": 0.038086, "value_mse_loss_layer_029": 0.042725, "value_mse_loss_layer_030": 0.049316, "value_mse_loss_layer_031": 0.044678, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.6e-05, "vq_loss_layer_003": 3.5e-05, "vq_loss_layer_004": 6e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000368, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000273, "vq_loss_layer_017": 0.00032, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000171, "vq_loss_layer_020": 0.000236, "vq_loss_layer_021": 0.00037, "vq_loss_layer_022": 0.000269, "vq_loss_layer_023": 0.000336, "vq_loss_layer_024": 0.000298, "vq_loss_layer_025": 0.000496, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.000668, "vq_loss_layer_030": 0.001595, "vq_loss_layer_031": 0.003281 }, { "ce_loss": 2.323749, "epoch": 0.01911, "grad_norm": 0.001106308656744659, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049561, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049936, "step": 19110, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011719, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000311, "vq_loss_layer_015": 0.000319, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000278, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.000228, "vq_loss_layer_024": 0.000208, "vq_loss_layer_025": 0.000239, "vq_loss_layer_026": 0.000395, "vq_loss_layer_027": 0.00042, "vq_loss_layer_028": 0.000778, "vq_loss_layer_029": 0.000732, "vq_loss_layer_030": 0.00161, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.335788, "epoch": 0.01912, "grad_norm": 0.001156660495325923, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.052002, "key_mse_loss_layer_004": 0.060303, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.068848, "key_mse_loss_layer_013": 0.108398, "key_mse_loss_layer_014": 0.10498, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086426, "key_mse_loss_layer_017": 0.09082, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.049316, "kv_vq_loss": 0.000374, "learning_rate": 0.001, "loss": 0.04968, "step": 19120, "value_mse_loss_layer_000": 0.000374, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.00383, "value_mse_loss_layer_003": 0.007202, "value_mse_loss_layer_004": 0.00647, "value_mse_loss_layer_005": 0.00589, "value_mse_loss_layer_006": 0.007874, "value_mse_loss_layer_007": 0.008179, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011719, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 2e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 3.9e-05, "vq_loss_layer_005": 4.4e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000128, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000143, "vq_loss_layer_011": 0.000176, "vq_loss_layer_012": 0.000298, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.000439, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000167, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000161, "vq_loss_layer_021": 0.000275, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.000213, "vq_loss_layer_024": 0.000195, "vq_loss_layer_025": 0.000209, "vq_loss_layer_026": 0.00037, "vq_loss_layer_027": 0.000414, "vq_loss_layer_028": 0.000595, "vq_loss_layer_029": 0.000767, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.002213 }, { "ce_loss": 2.34087, "epoch": 0.01913, "grad_norm": 0.0010565767297521234, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.106934, "key_mse_loss_layer_015": 0.096191, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.083008, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.049258, "kv_vq_loss": 0.00038, "learning_rate": 0.001, "loss": 0.049615, "step": 19130, "value_mse_loss_layer_000": 0.00037, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000108, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000298, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000309, "vq_loss_layer_015": 0.00033, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.000203, "vq_loss_layer_023": 0.000225, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.001694, "vq_loss_layer_031": 0.002747 }, { "ce_loss": 2.355659, "epoch": 0.01914, "grad_norm": 0.001147435512393713, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097168, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.09082, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.06543, "key_mse_loss_layer_026": 0.07373, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.07959, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049374, "kv_vq_loss": 0.000378, "learning_rate": 0.001, "loss": 0.049753, "step": 19140, "value_mse_loss_layer_000": 0.000374, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.003815, "value_mse_loss_layer_003": 0.007172, "value_mse_loss_layer_004": 0.006592, "value_mse_loss_layer_005": 0.006104, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008301, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054199, "value_mse_loss_layer_031": 0.045654, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000134, "vq_loss_layer_008": 0.00013, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.000175, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000299, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000288, "vq_loss_layer_017": 0.000267, "vq_loss_layer_018": 0.000164, "vq_loss_layer_019": 0.000136, "vq_loss_layer_020": 0.00016, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000203, "vq_loss_layer_023": 0.000219, "vq_loss_layer_024": 0.000202, "vq_loss_layer_025": 0.000233, "vq_loss_layer_026": 0.000341, "vq_loss_layer_027": 0.000443, "vq_loss_layer_028": 0.000553, "vq_loss_layer_029": 0.000698, "vq_loss_layer_030": 0.001595, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.263398, "epoch": 0.01915, "grad_norm": 0.0010798468720167875, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.086426, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049683, "kv_vq_loss": 0.0004, "learning_rate": 0.001, "loss": 0.05011, "step": 19150, "value_mse_loss_layer_000": 0.000359, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.027466, "value_mse_loss_layer_024": 0.030762, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.031982, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.054443, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000292, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000319, "vq_loss_layer_017": 0.000338, "vq_loss_layer_018": 0.000202, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.000219, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.00041, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.00209, "vq_loss_layer_031": 0.002548 }, { "ce_loss": 2.313046, "epoch": 0.01916, "grad_norm": 0.0011863491963595152, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.080566, "kv_mse_loss": 0.049292, "kv_vq_loss": 0.000382, "learning_rate": 0.001, "loss": 0.049667, "step": 19160, "value_mse_loss_layer_000": 0.000364, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.00647, "value_mse_loss_layer_005": 0.006134, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.049805, "value_mse_loss_layer_030": 0.053711, "value_mse_loss_layer_031": 0.044922, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000134, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000288, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000207, "vq_loss_layer_025": 0.000284, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.00116, "vq_loss_layer_029": 0.001488, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.003754 }, { "ce_loss": 2.322403, "epoch": 0.01917, "grad_norm": 0.0010878676548600197, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.059814, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.102539, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.130859, "key_mse_loss_layer_014": 0.12793, "key_mse_loss_layer_015": 0.114746, "key_mse_loss_layer_016": 0.105957, "key_mse_loss_layer_017": 0.107422, "key_mse_loss_layer_018": 0.112305, "key_mse_loss_layer_019": 0.09375, "key_mse_loss_layer_020": 0.106445, "key_mse_loss_layer_021": 0.098633, "key_mse_loss_layer_022": 0.098633, "key_mse_loss_layer_023": 0.09668, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.049323, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.04971, "step": 19170, "value_mse_loss_layer_000": 0.000364, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015564, "value_mse_loss_layer_010": 0.012695, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.014038, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015076, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.021118, "value_mse_loss_layer_022": 0.020386, "value_mse_loss_layer_023": 0.023438, "value_mse_loss_layer_024": 0.026489, "value_mse_loss_layer_025": 0.032227, "value_mse_loss_layer_026": 0.02771, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.039307, "value_mse_loss_layer_029": 0.044922, "value_mse_loss_layer_030": 0.05249, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 5.9e-05, "vq_loss_layer_005": 6.3e-05, "vq_loss_layer_006": 0.000121, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000183, "vq_loss_layer_009": 0.000269, "vq_loss_layer_010": 0.000215, "vq_loss_layer_011": 0.000218, "vq_loss_layer_012": 0.000372, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000391, "vq_loss_layer_015": 0.000452, "vq_loss_layer_016": 0.000364, "vq_loss_layer_017": 0.000349, "vq_loss_layer_018": 0.000207, "vq_loss_layer_019": 0.000192, "vq_loss_layer_020": 0.000256, "vq_loss_layer_021": 0.000378, "vq_loss_layer_022": 0.000284, "vq_loss_layer_023": 0.000315, "vq_loss_layer_024": 0.000345, "vq_loss_layer_025": 0.000576, "vq_loss_layer_026": 0.000599, "vq_loss_layer_027": 0.00066, "vq_loss_layer_028": 0.000896, "vq_loss_layer_029": 0.000954, "vq_loss_layer_030": 0.002869, "vq_loss_layer_031": 0.003906 }, { "ce_loss": 2.297432, "epoch": 0.01918, "grad_norm": 0.0014313405845314264, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.049805, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.074707, "key_mse_loss_layer_027": 0.074219, "key_mse_loss_layer_028": 0.081055, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.04968, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050098, "step": 19180, "value_mse_loss_layer_000": 0.000378, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.01239, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.018921, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.029297, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000188, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000178, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000393, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000159, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.000227, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000427, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.00074, "vq_loss_layer_030": 0.002533, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.30555, "epoch": 0.01919, "grad_norm": 0.0011000059312209487, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.055664, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049713, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050128, "step": 19190, "value_mse_loss_layer_000": 0.000366, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.003815, "value_mse_loss_layer_003": 0.00705, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.028076, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.029419, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.048828, "value_mse_loss_layer_030": 0.052979, "value_mse_loss_layer_031": 0.045654, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000142, "vq_loss_layer_009": 0.000179, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000311, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000235, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000429, "vq_loss_layer_027": 0.000458, "vq_loss_layer_028": 0.000729, "vq_loss_layer_029": 0.000816, "vq_loss_layer_030": 0.001709, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.32917, "epoch": 0.0192, "grad_norm": 0.0010504130041226745, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.098145, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049426, "kv_vq_loss": 0.000372, "learning_rate": 0.001, "loss": 0.049789, "step": 19200, "value_mse_loss_layer_000": 0.000355, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.00705, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.024902, "value_mse_loss_layer_024": 0.027344, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.029175, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.041992, "value_mse_loss_layer_029": 0.04834, "value_mse_loss_layer_030": 0.052979, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000155, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000389, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000347, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000214, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.000273, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.00034, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000725, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.001839, "vq_loss_layer_031": 0.002777 }, { "ce_loss": 2.308672, "epoch": 0.01921, "grad_norm": 0.001173298223875463, "key_mse_loss_layer_000": 0.003174, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.057617, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.061523, "key_mse_loss_layer_005": 0.064941, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.049765, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.050162, "step": 19210, "value_mse_loss_layer_000": 0.000364, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.0065, "value_mse_loss_layer_005": 0.006104, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023804, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000389, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000343, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000163, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.000364, "vq_loss_layer_022": 0.000254, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000294, "vq_loss_layer_026": 0.000475, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.318181, "epoch": 0.01922, "grad_norm": 0.0012324939016252756, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010437, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.050537, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.086914, "key_mse_loss_layer_030": 0.091309, "key_mse_loss_layer_031": 0.079102, "kv_mse_loss": 0.049277, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.04967, "step": 19220, "value_mse_loss_layer_000": 0.000357, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014526, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.024048, "value_mse_loss_layer_023": 0.0271, "value_mse_loss_layer_024": 0.03064, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041504, "value_mse_loss_layer_028": 0.046875, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.059082, "value_mse_loss_layer_031": 0.048096, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.1e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000181, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000292, "vq_loss_layer_013": 0.000248, "vq_loss_layer_014": 0.000317, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000183, "vq_loss_layer_021": 0.000275, "vq_loss_layer_022": 0.00025, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000301, "vq_loss_layer_025": 0.000324, "vq_loss_layer_026": 0.000534, "vq_loss_layer_027": 0.000645, "vq_loss_layer_028": 0.000919, "vq_loss_layer_029": 0.001396, "vq_loss_layer_030": 0.002411, "vq_loss_layer_031": 0.003876 }, { "ce_loss": 2.305171, "epoch": 0.01923, "grad_norm": 0.0010972386226058006, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.057617, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.074707, "kv_mse_loss": 0.04989, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.050296, "step": 19230, "value_mse_loss_layer_000": 0.000364, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006622, "value_mse_loss_layer_005": 0.006134, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043213, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000112, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000138, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000193, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000197, "vq_loss_layer_025": 0.000242, "vq_loss_layer_026": 0.000364, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.001564, "vq_loss_layer_031": 0.002716 }, { "ce_loss": 2.312212, "epoch": 0.01924, "grad_norm": 0.0011638811556622386, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.076172, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049634, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.050024, "step": 19240, "value_mse_loss_layer_000": 0.00036, "value_mse_loss_layer_001": 0.000977, "value_mse_loss_layer_002": 0.003799, "value_mse_loss_layer_003": 0.007172, "value_mse_loss_layer_004": 0.006622, "value_mse_loss_layer_005": 0.006195, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008423, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012207, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.028076, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.054199, "value_mse_loss_layer_031": 0.045166, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.000138, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000152, "vq_loss_layer_011": 0.000167, "vq_loss_layer_012": 0.000286, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.00033, "vq_loss_layer_016": 0.000282, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000174, "vq_loss_layer_019": 0.000138, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000202, "vq_loss_layer_023": 0.000227, "vq_loss_layer_024": 0.000195, "vq_loss_layer_025": 0.000232, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000414, "vq_loss_layer_028": 0.000626, "vq_loss_layer_029": 0.00069, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.002533 }, { "ce_loss": 2.289514, "epoch": 0.01925, "grad_norm": 0.0012006391771137714, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.05127, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.086426, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.115234, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049637, "kv_vq_loss": 0.000397, "learning_rate": 0.001, "loss": 0.050043, "step": 19250, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015198, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.022217, "value_mse_loss_layer_023": 0.025146, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.4e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000185, "vq_loss_layer_011": 0.000203, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000213, "vq_loss_layer_023": 0.000221, "vq_loss_layer_024": 0.000267, "vq_loss_layer_025": 0.000303, "vq_loss_layer_026": 0.000483, "vq_loss_layer_027": 0.000526, "vq_loss_layer_028": 0.00074, "vq_loss_layer_029": 0.000942, "vq_loss_layer_030": 0.002304, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.316717, "epoch": 0.01926, "grad_norm": 0.0010195871582254767, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085449, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.097656, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.073242, "kv_mse_loss": 0.049286, "kv_vq_loss": 0.00038, "learning_rate": 0.001, "loss": 0.049673, "step": 19260, "value_mse_loss_layer_000": 0.000376, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006683, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008423, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014587, "value_mse_loss_layer_010": 0.011597, "value_mse_loss_layer_011": 0.012268, "value_mse_loss_layer_012": 0.013306, "value_mse_loss_layer_013": 0.014587, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000182, "vq_loss_layer_010": 0.000154, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000269, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000193, "vq_loss_layer_021": 0.000269, "vq_loss_layer_022": 0.000202, "vq_loss_layer_023": 0.000237, "vq_loss_layer_024": 0.000204, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000786, "vq_loss_layer_030": 0.001808, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.353126, "epoch": 0.01927, "grad_norm": 0.001174882985651493, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.067383, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.073242, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.077637, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049429, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.049847, "step": 19270, "value_mse_loss_layer_000": 0.000355, "value_mse_loss_layer_001": 0.000973, "value_mse_loss_layer_002": 0.003754, "value_mse_loss_layer_003": 0.006989, "value_mse_loss_layer_004": 0.006409, "value_mse_loss_layer_005": 0.006165, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.014954, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.021973, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.027222, "value_mse_loss_layer_025": 0.031982, "value_mse_loss_layer_026": 0.02832, "value_mse_loss_layer_027": 0.036377, "value_mse_loss_layer_028": 0.041504, "value_mse_loss_layer_029": 0.047852, "value_mse_loss_layer_030": 0.052979, "value_mse_loss_layer_031": 0.044922, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000138, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000155, "vq_loss_layer_019": 0.000141, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000234, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000414, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000748, "vq_loss_layer_030": 0.001572, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.372457, "epoch": 0.01928, "grad_norm": 0.0009770032484084368, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.051758, "key_mse_loss_layer_004": 0.060547, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.111816, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.092285, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.080566, "kv_mse_loss": 0.049591, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049982, "step": 19280, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.003815, "value_mse_loss_layer_003": 0.00705, "value_mse_loss_layer_004": 0.006561, "value_mse_loss_layer_005": 0.005981, "value_mse_loss_layer_006": 0.007751, "value_mse_loss_layer_007": 0.00824, "value_mse_loss_layer_008": 0.010437, "value_mse_loss_layer_009": 0.014099, "value_mse_loss_layer_010": 0.011719, "value_mse_loss_layer_011": 0.012085, "value_mse_loss_layer_012": 0.013123, "value_mse_loss_layer_013": 0.014404, "value_mse_loss_layer_014": 0.015198, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.044922, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.00018, "vq_loss_layer_010": 0.000158, "vq_loss_layer_011": 0.000168, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000315, "vq_loss_layer_015": 0.000322, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000261, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000133, "vq_loss_layer_020": 0.000155, "vq_loss_layer_021": 0.000271, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000209, "vq_loss_layer_024": 0.000204, "vq_loss_layer_025": 0.00023, "vq_loss_layer_026": 0.000353, "vq_loss_layer_027": 0.000408, "vq_loss_layer_028": 0.000595, "vq_loss_layer_029": 0.000694, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.002274 }, { "ce_loss": 2.338191, "epoch": 0.01929, "grad_norm": 0.001169978640973568, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.098145, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.082031, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049771, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.050156, "step": 19290, "value_mse_loss_layer_000": 0.000355, "value_mse_loss_layer_001": 0.000977, "value_mse_loss_layer_002": 0.00383, "value_mse_loss_layer_003": 0.007019, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006165, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008423, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.011841, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.022339, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.000199, "vq_loss_layer_010": 0.000155, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000234, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000261, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000484, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.001656, "vq_loss_layer_031": 0.002975 }, { "ce_loss": 2.3002, "epoch": 0.0193, "grad_norm": 0.0010315014515072107, "key_mse_loss_layer_000": 0.003357, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.04248, "key_mse_loss_layer_005": 0.05542, "key_mse_loss_layer_006": 0.062256, "key_mse_loss_layer_007": 0.071289, "key_mse_loss_layer_008": 0.079102, "key_mse_loss_layer_009": 0.082031, "key_mse_loss_layer_010": 0.09375, "key_mse_loss_layer_011": 0.09375, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.106934, "key_mse_loss_layer_014": 0.104004, "key_mse_loss_layer_015": 0.091797, "key_mse_loss_layer_016": 0.084473, "key_mse_loss_layer_017": 0.088379, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.081543, "key_mse_loss_layer_020": 0.086914, "key_mse_loss_layer_021": 0.084473, "key_mse_loss_layer_022": 0.086426, "key_mse_loss_layer_023": 0.084961, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049515, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049896, "step": 19300, "value_mse_loss_layer_000": 0.000357, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.017212, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.030518, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.057617, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 3.7e-05, "vq_loss_layer_005": 4.7e-05, "vq_loss_layer_006": 8.3e-05, "vq_loss_layer_007": 0.000125, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000177, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000271, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000242, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.00015, "vq_loss_layer_021": 0.000248, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000207, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000269, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.001038, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.003265 }, { "ce_loss": 2.340416, "epoch": 0.01931, "grad_norm": 0.001077887718565762, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.052246, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.045654, "key_mse_loss_layer_005": 0.057129, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.121094, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.103516, "key_mse_loss_layer_016": 0.095703, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.090332, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049683, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.050089, "step": 19310, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.021606, "value_mse_loss_layer_022": 0.022095, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.032471, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 4.8e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000179, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000359, "vq_loss_layer_015": 0.000326, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000328, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000199, "vq_loss_layer_021": 0.000296, "vq_loss_layer_022": 0.000202, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000288, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000425, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000767, "vq_loss_layer_030": 0.001579, "vq_loss_layer_031": 0.002655 }, { "ce_loss": 2.292223, "epoch": 0.01932, "grad_norm": 0.0011218086583539844, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.04985, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.050229, "step": 19320, "value_mse_loss_layer_000": 0.000357, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.003845, "value_mse_loss_layer_003": 0.007202, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000201, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000209, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000243, "vq_loss_layer_024": 0.000201, "vq_loss_layer_025": 0.000244, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.00045, "vq_loss_layer_028": 0.000713, "vq_loss_layer_029": 0.000771, "vq_loss_layer_030": 0.00161, "vq_loss_layer_031": 0.002701 }, { "ce_loss": 2.313589, "epoch": 0.01933, "grad_norm": 0.0010842448100447655, "key_mse_loss_layer_000": 0.003143, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.044434, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.081055, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049829, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050204, "step": 19330, "value_mse_loss_layer_000": 0.000353, "value_mse_loss_layer_001": 0.000973, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.014587, "value_mse_loss_layer_010": 0.011658, "value_mse_loss_layer_011": 0.012268, "value_mse_loss_layer_012": 0.013306, "value_mse_loss_layer_013": 0.014587, "value_mse_loss_layer_014": 0.01532, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.027588, "value_mse_loss_layer_024": 0.03125, "value_mse_loss_layer_025": 0.036377, "value_mse_loss_layer_026": 0.033691, "value_mse_loss_layer_027": 0.042236, "value_mse_loss_layer_028": 0.04834, "value_mse_loss_layer_029": 0.055908, "value_mse_loss_layer_030": 0.060791, "value_mse_loss_layer_031": 0.050049, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 4.1e-05, "vq_loss_layer_005": 4.7e-05, "vq_loss_layer_006": 8.7e-05, "vq_loss_layer_007": 0.000131, "vq_loss_layer_008": 0.000152, "vq_loss_layer_009": 0.000186, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000286, "vq_loss_layer_013": 0.000267, "vq_loss_layer_014": 0.000311, "vq_loss_layer_015": 0.000334, "vq_loss_layer_016": 0.000317, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000256, "vq_loss_layer_019": 0.000273, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000269, "vq_loss_layer_022": 0.000234, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000292, "vq_loss_layer_026": 0.000441, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.001282, "vq_loss_layer_029": 0.001709, "vq_loss_layer_030": 0.001892, "vq_loss_layer_031": 0.003448 }, { "ce_loss": 2.272369, "epoch": 0.01934, "grad_norm": 0.0011265507200732827, "key_mse_loss_layer_000": 0.003311, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.048828, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.118652, "key_mse_loss_layer_014": 0.114746, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103027, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080566, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.04967, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050082, "step": 19340, "value_mse_loss_layer_000": 0.000378, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012268, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015076, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025024, "value_mse_loss_layer_024": 0.027344, "value_mse_loss_layer_025": 0.032471, "value_mse_loss_layer_026": 0.028809, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.048828, "value_mse_loss_layer_030": 0.052979, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000194, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.000378, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000317, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000189, "vq_loss_layer_021": 0.000343, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000263, "vq_loss_layer_024": 0.000256, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000467, "vq_loss_layer_027": 0.000481, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.00082, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.003006 }, { "ce_loss": 2.32435, "epoch": 0.01935, "grad_norm": 0.0011161844013258815, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.07959, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105957, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091309, "key_mse_loss_layer_018": 0.095703, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.049329, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.049716, "step": 19350, "value_mse_loss_layer_000": 0.000349, "value_mse_loss_layer_001": 0.000969, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.0354, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.00015, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000177, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000387, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.00018, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000341, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000298, "vq_loss_layer_024": 0.00034, "vq_loss_layer_025": 0.000429, "vq_loss_layer_026": 0.000587, "vq_loss_layer_027": 0.000854, "vq_loss_layer_028": 0.001549, "vq_loss_layer_029": 0.002686, "vq_loss_layer_030": 0.003372, "vq_loss_layer_031": 0.00473 }, { "ce_loss": 2.323631, "epoch": 0.01936, "grad_norm": 0.0010188240557909012, "key_mse_loss_layer_000": 0.003082, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.087402, "key_mse_loss_layer_009": 0.092773, "key_mse_loss_layer_010": 0.10498, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.125, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.109375, "key_mse_loss_layer_016": 0.103027, "key_mse_loss_layer_017": 0.104004, "key_mse_loss_layer_018": 0.11084, "key_mse_loss_layer_019": 0.092285, "key_mse_loss_layer_020": 0.103516, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.100586, "key_mse_loss_layer_023": 0.097656, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.086914, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.09375, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.092773, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049646, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.050052, "step": 19360, "value_mse_loss_layer_000": 0.00036, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007416, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.016846, "value_mse_loss_layer_016": 0.013794, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.022217, "value_mse_loss_layer_023": 0.025024, "value_mse_loss_layer_024": 0.02771, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.047852, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000144, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000322, "vq_loss_layer_018": 0.0002, "vq_loss_layer_019": 0.000167, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.00033, "vq_loss_layer_022": 0.000256, "vq_loss_layer_023": 0.000305, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000364, "vq_loss_layer_026": 0.00053, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000889, "vq_loss_layer_029": 0.000889, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.002899 }, { "ce_loss": 2.326957, "epoch": 0.01937, "grad_norm": 0.0011319371405988932, "key_mse_loss_layer_000": 0.003052, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.046143, "key_mse_loss_layer_004": 0.046387, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.123047, "key_mse_loss_layer_014": 0.119141, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.100098, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.106934, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.098145, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.049768, "kv_vq_loss": 0.000403, "learning_rate": 0.001, "loss": 0.050198, "step": 19370, "value_mse_loss_layer_000": 0.000353, "value_mse_loss_layer_001": 0.000977, "value_mse_loss_layer_002": 0.003891, "value_mse_loss_layer_003": 0.007263, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012207, "value_mse_loss_layer_012": 0.013184, "value_mse_loss_layer_013": 0.014709, "value_mse_loss_layer_014": 0.01532, "value_mse_loss_layer_015": 0.016724, "value_mse_loss_layer_016": 0.013489, "value_mse_loss_layer_017": 0.017212, "value_mse_loss_layer_018": 0.014893, "value_mse_loss_layer_019": 0.017944, "value_mse_loss_layer_020": 0.018921, "value_mse_loss_layer_021": 0.020874, "value_mse_loss_layer_022": 0.021606, "value_mse_loss_layer_023": 0.024048, "value_mse_loss_layer_024": 0.027344, "value_mse_loss_layer_025": 0.031128, "value_mse_loss_layer_026": 0.027588, "value_mse_loss_layer_027": 0.036621, "value_mse_loss_layer_028": 0.040771, "value_mse_loss_layer_029": 0.047119, "value_mse_loss_layer_030": 0.053223, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000186, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000193, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000244, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000353, "vq_loss_layer_016": 0.000286, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000146, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000162, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000204, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000286, "vq_loss_layer_026": 0.000389, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.000767, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.002884 }, { "ce_loss": 2.365623, "epoch": 0.01938, "grad_norm": 0.0012558329617604613, "key_mse_loss_layer_000": 0.002945, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.06543, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.080078, "key_mse_loss_layer_008": 0.090332, "key_mse_loss_layer_009": 0.09375, "key_mse_loss_layer_010": 0.108887, "key_mse_loss_layer_011": 0.10498, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.121094, "key_mse_loss_layer_015": 0.109863, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.10791, "key_mse_loss_layer_019": 0.091797, "key_mse_loss_layer_020": 0.103027, "key_mse_loss_layer_021": 0.097168, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.04938, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.049771, "step": 19380, "value_mse_loss_layer_000": 0.000343, "value_mse_loss_layer_001": 0.000969, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008728, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.013977, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.02417, "value_mse_loss_layer_024": 0.026245, "value_mse_loss_layer_025": 0.031738, "value_mse_loss_layer_026": 0.027954, "value_mse_loss_layer_027": 0.0354, "value_mse_loss_layer_028": 0.039795, "value_mse_loss_layer_029": 0.045166, "value_mse_loss_layer_030": 0.053467, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5.6e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000134, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000163, "vq_loss_layer_009": 0.000205, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000209, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000353, "vq_loss_layer_017": 0.000336, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.00017, "vq_loss_layer_020": 0.000271, "vq_loss_layer_021": 0.000362, "vq_loss_layer_022": 0.000286, "vq_loss_layer_023": 0.000303, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000406, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.000618, "vq_loss_layer_030": 0.0019, "vq_loss_layer_031": 0.002991 }, { "ce_loss": 2.303294, "epoch": 0.01939, "grad_norm": 0.0010224689031019807, "key_mse_loss_layer_000": 0.003464, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.056396, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.071777, "key_mse_loss_layer_008": 0.079102, "key_mse_loss_layer_009": 0.08252, "key_mse_loss_layer_010": 0.094727, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.069336, "key_mse_loss_layer_013": 0.106445, "key_mse_loss_layer_014": 0.103027, "key_mse_loss_layer_015": 0.092773, "key_mse_loss_layer_016": 0.084961, "key_mse_loss_layer_017": 0.088867, "key_mse_loss_layer_018": 0.094727, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.090332, "key_mse_loss_layer_021": 0.085938, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.079102, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.07373, "key_mse_loss_layer_031": 0.060303, "kv_mse_loss": 0.049271, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.049631, "step": 19390, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007599, "value_mse_loss_layer_004": 0.007355, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015198, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014587, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.046387, "value_mse_loss_layer_029": 0.054199, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.04834, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.8e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 8.4e-05, "vq_loss_layer_007": 0.000122, "vq_loss_layer_008": 0.000151, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000178, "vq_loss_layer_012": 0.000286, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000177, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000227, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000277, "vq_loss_layer_025": 0.000341, "vq_loss_layer_026": 0.000492, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000893, "vq_loss_layer_029": 0.001205, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.003418 }, { "ce_loss": 2.318629, "epoch": 0.0194, "grad_norm": 0.0012456532567739487, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049231, "kv_vq_loss": 0.000379, "learning_rate": 0.001, "loss": 0.049591, "step": 19400, "value_mse_loss_layer_000": 0.000355, "value_mse_loss_layer_001": 0.000969, "value_mse_loss_layer_002": 0.003784, "value_mse_loss_layer_003": 0.006989, "value_mse_loss_layer_004": 0.006531, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008362, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014465, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015442, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.038818, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.045166, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000136, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000179, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000187, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000182, "vq_loss_layer_019": 0.000141, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.00023, "vq_loss_layer_023": 0.000282, "vq_loss_layer_024": 0.000239, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000401, "vq_loss_layer_027": 0.000479, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.000687, "vq_loss_layer_030": 0.002563, "vq_loss_layer_031": 0.002472 }, { "ce_loss": 2.333324, "epoch": 0.01941, "grad_norm": 0.001018631854094565, "key_mse_loss_layer_000": 0.003479, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.10791, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083984, "key_mse_loss_layer_027": 0.084473, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.086914, "key_mse_loss_layer_031": 0.067383, "kv_mse_loss": 0.049545, "kv_vq_loss": 0.000382, "learning_rate": 0.001, "loss": 0.049939, "step": 19410, "value_mse_loss_layer_000": 0.000374, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.00708, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008484, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.013855, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.025391, "value_mse_loss_layer_024": 0.028076, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.030151, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.054932, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000104, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000198, "vq_loss_layer_021": 0.00036, "vq_loss_layer_022": 0.000259, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000484, "vq_loss_layer_028": 0.000782, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.001778, "vq_loss_layer_031": 0.00296 }, { "ce_loss": 2.285396, "epoch": 0.01942, "grad_norm": 0.001002717763185501, "key_mse_loss_layer_000": 0.002823, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.09082, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.104492, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.108887, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.095703, "key_mse_loss_layer_023": 0.095215, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.061523, "kv_mse_loss": 0.049176, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.04957, "step": 19420, "value_mse_loss_layer_000": 0.000341, "value_mse_loss_layer_001": 0.000977, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007416, "value_mse_loss_layer_005": 0.006775, "value_mse_loss_layer_006": 0.008667, "value_mse_loss_layer_007": 0.009033, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.014099, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016113, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.022705, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.049072, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.046875, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.2e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000148, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.000204, "vq_loss_layer_010": 0.000191, "vq_loss_layer_011": 0.000197, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000372, "vq_loss_layer_015": 0.000376, "vq_loss_layer_016": 0.00034, "vq_loss_layer_017": 0.000313, "vq_loss_layer_018": 0.000204, "vq_loss_layer_019": 0.000186, "vq_loss_layer_020": 0.000213, "vq_loss_layer_021": 0.000334, "vq_loss_layer_022": 0.000231, "vq_loss_layer_023": 0.000257, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000433, "vq_loss_layer_027": 0.000452, "vq_loss_layer_028": 0.000835, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.001762, "vq_loss_layer_031": 0.003204 }, { "ce_loss": 2.304023, "epoch": 0.01943, "grad_norm": 0.0011799593921750784, "key_mse_loss_layer_000": 0.003021, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.062256, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.100098, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.100586, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.09668, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.074219, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.087891, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049335, "kv_vq_loss": 0.00039, "learning_rate": 0.001, "loss": 0.049741, "step": 19430, "value_mse_loss_layer_000": 0.000359, "value_mse_loss_layer_001": 0.000977, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006592, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012451, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015564, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.049561, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.045654, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000109, "vq_loss_layer_007": 0.000159, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000301, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000235, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000216, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000441, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000698, "vq_loss_layer_030": 0.001915, "vq_loss_layer_031": 0.002258 }, { "ce_loss": 2.289204, "epoch": 0.01944, "grad_norm": 0.0010049373377114534, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.043945, "key_mse_loss_layer_004": 0.040527, "key_mse_loss_layer_005": 0.054199, "key_mse_loss_layer_006": 0.061035, "key_mse_loss_layer_007": 0.069824, "key_mse_loss_layer_008": 0.07959, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.094238, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.120605, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.105469, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.100098, "key_mse_loss_layer_021": 0.096191, "key_mse_loss_layer_022": 0.098145, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.075684, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.058838, "kv_mse_loss": 0.04938, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049771, "step": 19440, "value_mse_loss_layer_000": 0.000359, "value_mse_loss_layer_001": 0.000977, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.014465, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014648, "value_mse_loss_layer_014": 0.015137, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.013672, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.014587, "value_mse_loss_layer_019": 0.017334, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.027222, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.052734, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 9e-06, "vq_loss_layer_002": 1.3e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.1e-05, "vq_loss_layer_007": 0.000139, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000196, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.000381, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000347, "vq_loss_layer_017": 0.000301, "vq_loss_layer_018": 0.000163, "vq_loss_layer_019": 0.000178, "vq_loss_layer_020": 0.00021, "vq_loss_layer_021": 0.000397, "vq_loss_layer_022": 0.000246, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000961, "vq_loss_layer_029": 0.000957, "vq_loss_layer_030": 0.001816, "vq_loss_layer_031": 0.003479 }, { "ce_loss": 2.309832, "epoch": 0.01945, "grad_norm": 0.0012631909921765327, "key_mse_loss_layer_000": 0.003036, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.046631, "key_mse_loss_layer_005": 0.056885, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.080566, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.108887, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089355, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.083496, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.049512, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.049905, "step": 19450, "value_mse_loss_layer_000": 0.000366, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015198, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.028687, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.045654, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.05542, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.6e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000193, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000341, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000303, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000161, "vq_loss_layer_019": 0.000134, "vq_loss_layer_020": 0.000182, "vq_loss_layer_021": 0.000313, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000208, "vq_loss_layer_024": 0.000209, "vq_loss_layer_025": 0.000257, "vq_loss_layer_026": 0.000387, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.000763, "vq_loss_layer_030": 0.002228, "vq_loss_layer_031": 0.002731 }, { "ce_loss": 2.273293, "epoch": 0.01946, "grad_norm": 0.0010901953792199492, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.058105, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.050049, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.069824, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.093262, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.078613, "key_mse_loss_layer_013": 0.125977, "key_mse_loss_layer_014": 0.123047, "key_mse_loss_layer_015": 0.113281, "key_mse_loss_layer_016": 0.10498, "key_mse_loss_layer_017": 0.104492, "key_mse_loss_layer_018": 0.111816, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.104492, "key_mse_loss_layer_021": 0.098633, "key_mse_loss_layer_022": 0.102539, "key_mse_loss_layer_023": 0.099609, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.088379, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.094727, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.094727, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.04986, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050269, "step": 19460, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006927, "value_mse_loss_layer_005": 0.006561, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014587, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017334, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.017212, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.0177, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.022217, "value_mse_loss_layer_023": 0.024536, "value_mse_loss_layer_024": 0.02771, "value_mse_loss_layer_025": 0.032227, "value_mse_loss_layer_026": 0.029419, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.041992, "value_mse_loss_layer_029": 0.048584, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 6.7e-05, "vq_loss_layer_006": 0.000122, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.000167, "vq_loss_layer_009": 0.000212, "vq_loss_layer_010": 0.000192, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.00037, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000166, "vq_loss_layer_020": 0.000197, "vq_loss_layer_021": 0.00032, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000267, "vq_loss_layer_024": 0.000275, "vq_loss_layer_025": 0.000391, "vq_loss_layer_026": 0.000511, "vq_loss_layer_027": 0.000557, "vq_loss_layer_028": 0.000832, "vq_loss_layer_029": 0.000916, "vq_loss_layer_030": 0.002075, "vq_loss_layer_031": 0.003128 }, { "ce_loss": 2.263474, "epoch": 0.01947, "grad_norm": 0.0010288943303748965, "key_mse_loss_layer_000": 0.002914, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.046387, "key_mse_loss_layer_004": 0.04834, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.097656, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.076172, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.063965, "kv_mse_loss": 0.049667, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050061, "step": 19470, "value_mse_loss_layer_000": 0.000357, "value_mse_loss_layer_001": 0.000965, "value_mse_loss_layer_002": 0.003845, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014099, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.014954, "value_mse_loss_layer_019": 0.0177, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.021729, "value_mse_loss_layer_022": 0.021973, "value_mse_loss_layer_023": 0.024902, "value_mse_loss_layer_024": 0.0271, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.028198, "value_mse_loss_layer_027": 0.036133, "value_mse_loss_layer_028": 0.041992, "value_mse_loss_layer_029": 0.047119, "value_mse_loss_layer_030": 0.05127, "value_mse_loss_layer_031": 0.045166, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.7e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000169, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000178, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000362, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000191, "vq_loss_layer_020": 0.0002, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.000275, "vq_loss_layer_024": 0.000231, "vq_loss_layer_025": 0.000368, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000786, "vq_loss_layer_029": 0.000938, "vq_loss_layer_030": 0.00174, "vq_loss_layer_031": 0.002792 }, { "ce_loss": 2.305753, "epoch": 0.01948, "grad_norm": 0.001199823571369052, "key_mse_loss_layer_000": 0.003189, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.04541, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.076172, "key_mse_loss_layer_030": 0.075684, "key_mse_loss_layer_031": 0.05957, "kv_mse_loss": 0.049304, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.049667, "step": 19480, "value_mse_loss_layer_000": 0.00037, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.007019, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.01416, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019165, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.022339, "value_mse_loss_layer_023": 0.025391, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.041992, "value_mse_loss_layer_029": 0.049072, "value_mse_loss_layer_030": 0.053467, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000176, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000195, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000399, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000378, "vq_loss_layer_022": 0.000248, "vq_loss_layer_023": 0.000294, "vq_loss_layer_024": 0.000332, "vq_loss_layer_025": 0.000343, "vq_loss_layer_026": 0.000477, "vq_loss_layer_027": 0.000572, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003311 }, { "ce_loss": 2.286653, "epoch": 0.01949, "grad_norm": 0.0010688616894185543, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.120117, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.102051, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.09082, "key_mse_loss_layer_020": 0.102051, "key_mse_loss_layer_021": 0.09668, "key_mse_loss_layer_022": 0.099121, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.084961, "key_mse_loss_layer_030": 0.091797, "key_mse_loss_layer_031": 0.072266, "kv_mse_loss": 0.049945, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.050323, "step": 19490, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.003845, "value_mse_loss_layer_003": 0.007172, "value_mse_loss_layer_004": 0.006866, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008423, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.014465, "value_mse_loss_layer_010": 0.011658, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013, "value_mse_loss_layer_013": 0.014343, "value_mse_loss_layer_014": 0.014709, "value_mse_loss_layer_015": 0.016724, "value_mse_loss_layer_016": 0.013611, "value_mse_loss_layer_017": 0.01709, "value_mse_loss_layer_018": 0.014832, "value_mse_loss_layer_019": 0.017578, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.022217, "value_mse_loss_layer_023": 0.024536, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.032227, "value_mse_loss_layer_026": 0.028931, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.041748, "value_mse_loss_layer_029": 0.04834, "value_mse_loss_layer_030": 0.052979, "value_mse_loss_layer_031": 0.044678, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000292, "vq_loss_layer_013": 0.000237, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000307, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000257, "vq_loss_layer_018": 0.000149, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000206, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000195, "vq_loss_layer_023": 0.00019, "vq_loss_layer_024": 0.000232, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000404, "vq_loss_layer_028": 0.000626, "vq_loss_layer_029": 0.000683, "vq_loss_layer_030": 0.001793, "vq_loss_layer_031": 0.00235 }, { "ce_loss": 2.384092, "epoch": 0.0195, "grad_norm": 0.001248174929060042, "key_mse_loss_layer_000": 0.004181, "key_mse_loss_layer_001": 0.011169, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.045898, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.101562, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.073242, "key_mse_loss_layer_026": 0.083496, "key_mse_loss_layer_027": 0.087891, "key_mse_loss_layer_028": 0.09082, "key_mse_loss_layer_029": 0.089844, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049075, "kv_vq_loss": 0.000374, "learning_rate": 0.001, "loss": 0.049426, "step": 19500, "value_mse_loss_layer_000": 0.000387, "value_mse_loss_layer_001": 0.001053, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.00769, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014587, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.017212, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.027344, "value_mse_loss_layer_024": 0.032471, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.034424, "value_mse_loss_layer_027": 0.044189, "value_mse_loss_layer_028": 0.048584, "value_mse_loss_layer_029": 0.059082, "value_mse_loss_layer_030": 0.064453, "value_mse_loss_layer_031": 0.051758, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.1e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000173, "vq_loss_layer_009": 0.000187, "vq_loss_layer_010": 0.000184, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000383, "vq_loss_layer_015": 0.000456, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000269, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000195, "vq_loss_layer_020": 0.000153, "vq_loss_layer_021": 0.000244, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.000193, "vq_loss_layer_024": 0.00029, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000486, "vq_loss_layer_028": 0.00106, "vq_loss_layer_029": 0.001053, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.003021 }, { "ce_loss": 2.284636, "epoch": 0.01951, "grad_norm": 0.0011146386386826634, "key_mse_loss_layer_000": 0.002716, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.060547, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112793, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.075684, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.078613, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049533, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.049933, "step": 19510, "value_mse_loss_layer_000": 0.000341, "value_mse_loss_layer_001": 0.00095, "value_mse_loss_layer_002": 0.003769, "value_mse_loss_layer_003": 0.007111, "value_mse_loss_layer_004": 0.006561, "value_mse_loss_layer_005": 0.006104, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.037842, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.045654, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.00011, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000171, "vq_loss_layer_011": 0.000181, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.00033, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000303, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000196, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000252, "vq_loss_layer_024": 0.000243, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000435, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000813, "vq_loss_layer_030": 0.001701, "vq_loss_layer_031": 0.002594 }, { "ce_loss": 2.282882, "epoch": 0.01952, "grad_norm": 0.0011506149312481284, "key_mse_loss_layer_000": 0.003616, "key_mse_loss_layer_001": 0.01062, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.077148, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.085938, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.067871, "kv_mse_loss": 0.049734, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050101, "step": 19520, "value_mse_loss_layer_000": 0.000359, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.003998, "value_mse_loss_layer_003": 0.007538, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006744, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026978, "value_mse_loss_layer_024": 0.031738, "value_mse_loss_layer_025": 0.035645, "value_mse_loss_layer_026": 0.033203, "value_mse_loss_layer_027": 0.04248, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.056396, "value_mse_loss_layer_030": 0.061035, "value_mse_loss_layer_031": 0.049561, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 6e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.1e-05, "vq_loss_layer_007": 0.000141, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000184, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000288, "vq_loss_layer_013": 0.000263, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000336, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000237, "vq_loss_layer_018": 0.000206, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000146, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000191, "vq_loss_layer_024": 0.000326, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000425, "vq_loss_layer_027": 0.00053, "vq_loss_layer_028": 0.000805, "vq_loss_layer_029": 0.00103, "vq_loss_layer_030": 0.002182, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.309434, "epoch": 0.01953, "grad_norm": 0.0010263024596497416, "key_mse_loss_layer_000": 0.00293, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050293, "key_mse_loss_layer_004": 0.055176, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.07959, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.075195, "kv_mse_loss": 0.049634, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.050034, "step": 19530, "value_mse_loss_layer_000": 0.000357, "value_mse_loss_layer_001": 0.000973, "value_mse_loss_layer_002": 0.003815, "value_mse_loss_layer_003": 0.007019, "value_mse_loss_layer_004": 0.006561, "value_mse_loss_layer_005": 0.006165, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.01416, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018799, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.049805, "value_mse_loss_layer_030": 0.054199, "value_mse_loss_layer_031": 0.045166, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000191, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000319, "vq_loss_layer_022": 0.000232, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000238, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.001945, "vq_loss_layer_031": 0.002884 }, { "ce_loss": 2.310112, "epoch": 0.01954, "grad_norm": 0.0010667535243555903, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.051025, "key_mse_loss_layer_004": 0.054199, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.095215, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.09082, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049362, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.049741, "step": 19540, "value_mse_loss_layer_000": 0.000355, "value_mse_loss_layer_001": 0.000977, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007263, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014587, "value_mse_loss_layer_010": 0.011658, "value_mse_loss_layer_011": 0.012146, "value_mse_loss_layer_012": 0.013, "value_mse_loss_layer_013": 0.014343, "value_mse_loss_layer_014": 0.014954, "value_mse_loss_layer_015": 0.01709, "value_mse_loss_layer_016": 0.013916, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.015259, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022583, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.000226, "vq_loss_layer_010": 0.00015, "vq_loss_layer_011": 0.000169, "vq_loss_layer_012": 0.000284, "vq_loss_layer_013": 0.000238, "vq_loss_layer_014": 0.000313, "vq_loss_layer_015": 0.000307, "vq_loss_layer_016": 0.000288, "vq_loss_layer_017": 0.00025, "vq_loss_layer_018": 0.000155, "vq_loss_layer_019": 0.000143, "vq_loss_layer_020": 0.000156, "vq_loss_layer_021": 0.000269, "vq_loss_layer_022": 0.000198, "vq_loss_layer_023": 0.000208, "vq_loss_layer_024": 0.000213, "vq_loss_layer_025": 0.00024, "vq_loss_layer_026": 0.000414, "vq_loss_layer_027": 0.000401, "vq_loss_layer_028": 0.000648, "vq_loss_layer_029": 0.000751, "vq_loss_layer_030": 0.001801, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.326348, "epoch": 0.01955, "grad_norm": 0.0009961427422240376, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.010681, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.049316, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.096191, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.094727, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.091797, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.082031, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.04946, "kv_vq_loss": 0.000382, "learning_rate": 0.001, "loss": 0.049832, "step": 19550, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007263, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000147, "vq_loss_layer_009": 0.000191, "vq_loss_layer_010": 0.000176, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000357, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000286, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000186, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.000221, "vq_loss_layer_024": 0.000248, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000454, "vq_loss_layer_027": 0.000542, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.00116, "vq_loss_layer_030": 0.002243, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.350509, "epoch": 0.01956, "grad_norm": 0.0011096466332674026, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.050049, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.103516, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.119141, "key_mse_loss_layer_014": 0.116699, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.097168, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.075195, "key_mse_loss_layer_025": 0.072754, "key_mse_loss_layer_026": 0.083008, "key_mse_loss_layer_027": 0.083984, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049185, "kv_vq_loss": 0.000382, "learning_rate": 0.001, "loss": 0.049557, "step": 19560, "value_mse_loss_layer_000": 0.000372, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007141, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006195, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014526, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.01416, "value_mse_loss_layer_017": 0.018066, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.029297, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.040039, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.047363, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000163, "vq_loss_layer_008": 0.000165, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000173, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000307, "vq_loss_layer_014": 0.000345, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000315, "vq_loss_layer_018": 0.000192, "vq_loss_layer_019": 0.000175, "vq_loss_layer_020": 0.000201, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000237, "vq_loss_layer_023": 0.000299, "vq_loss_layer_024": 0.000259, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.000463, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000755, "vq_loss_layer_029": 0.000824, "vq_loss_layer_030": 0.00193, "vq_loss_layer_031": 0.002487 }, { "ce_loss": 2.286221, "epoch": 0.01957, "grad_norm": 0.0011083598947152495, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.082031, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.111328, "key_mse_loss_layer_014": 0.108398, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.087402, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.074707, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.078613, "key_mse_loss_layer_031": 0.06543, "kv_mse_loss": 0.049042, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.049435, "step": 19570, "value_mse_loss_layer_000": 0.000368, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006744, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012756, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014771, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.050049, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000166, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000261, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000341, "vq_loss_layer_017": 0.000292, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.000326, "vq_loss_layer_022": 0.000233, "vq_loss_layer_023": 0.000227, "vq_loss_layer_024": 0.000239, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000397, "vq_loss_layer_027": 0.000488, "vq_loss_layer_028": 0.000694, "vq_loss_layer_029": 0.000896, "vq_loss_layer_030": 0.001541, "vq_loss_layer_031": 0.002716 }, { "ce_loss": 2.322852, "epoch": 0.01958, "grad_norm": 0.0011776949977502227, "key_mse_loss_layer_000": 0.003433, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056396, "key_mse_loss_layer_003": 0.05249, "key_mse_loss_layer_004": 0.05835, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.070312, "key_mse_loss_layer_007": 0.079102, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.100098, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.097656, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.07666, "kv_mse_loss": 0.049298, "kv_vq_loss": 0.000375, "learning_rate": 0.001, "loss": 0.049655, "step": 19580, "value_mse_loss_layer_000": 0.000366, "value_mse_loss_layer_001": 0.001007, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011169, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.013367, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014893, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.016479, "value_mse_loss_layer_019": 0.018921, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.029541, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030518, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.051758, "value_mse_loss_layer_030": 0.057373, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000168, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000169, "vq_loss_layer_011": 0.000204, "vq_loss_layer_012": 0.000315, "vq_loss_layer_013": 0.000315, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.00029, "vq_loss_layer_018": 0.000212, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000174, "vq_loss_layer_021": 0.00028, "vq_loss_layer_022": 0.000196, "vq_loss_layer_023": 0.000208, "vq_loss_layer_024": 0.000246, "vq_loss_layer_025": 0.000277, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000549, "vq_loss_layer_028": 0.00061, "vq_loss_layer_029": 0.000874, "vq_loss_layer_030": 0.001968, "vq_loss_layer_031": 0.00267 }, { "ce_loss": 2.31827, "epoch": 0.01959, "grad_norm": 0.001112503930926323, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.090332, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.118652, "key_mse_loss_layer_015": 0.10791, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.107422, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.102539, "key_mse_loss_layer_021": 0.097656, "key_mse_loss_layer_022": 0.099609, "key_mse_loss_layer_023": 0.096191, "key_mse_loss_layer_024": 0.076172, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.090332, "key_mse_loss_layer_029": 0.083984, "key_mse_loss_layer_030": 0.090332, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049597, "kv_vq_loss": 0.000381, "learning_rate": 0.001, "loss": 0.049973, "step": 19590, "value_mse_loss_layer_000": 0.00036, "value_mse_loss_layer_001": 0.000965, "value_mse_loss_layer_002": 0.00383, "value_mse_loss_layer_003": 0.00708, "value_mse_loss_layer_004": 0.006592, "value_mse_loss_layer_005": 0.006042, "value_mse_loss_layer_006": 0.007996, "value_mse_loss_layer_007": 0.008423, "value_mse_loss_layer_008": 0.010315, "value_mse_loss_layer_009": 0.014465, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012024, "value_mse_loss_layer_012": 0.013123, "value_mse_loss_layer_013": 0.014282, "value_mse_loss_layer_014": 0.015259, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.013733, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.014832, "value_mse_loss_layer_019": 0.017578, "value_mse_loss_layer_020": 0.019775, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.024902, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.031982, "value_mse_loss_layer_026": 0.029297, "value_mse_loss_layer_027": 0.037598, "value_mse_loss_layer_028": 0.042236, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.053467, "value_mse_loss_layer_031": 0.044922, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000143, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000171, "vq_loss_layer_012": 0.000313, "vq_loss_layer_013": 0.000239, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000324, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.000162, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000315, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.000311, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.000687, "vq_loss_layer_029": 0.000721, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.00238 }, { "ce_loss": 2.31757, "epoch": 0.0196, "grad_norm": 0.0011657711584120989, "key_mse_loss_layer_000": 0.003326, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.068848, "key_mse_loss_layer_007": 0.078613, "key_mse_loss_layer_008": 0.086914, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.103027, "key_mse_loss_layer_011": 0.102051, "key_mse_loss_layer_012": 0.07666, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.115723, "key_mse_loss_layer_015": 0.105469, "key_mse_loss_layer_016": 0.096191, "key_mse_loss_layer_017": 0.099121, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.095703, "key_mse_loss_layer_022": 0.096191, "key_mse_loss_layer_023": 0.093262, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071777, "key_mse_loss_layer_026": 0.08252, "key_mse_loss_layer_027": 0.082031, "key_mse_loss_layer_028": 0.089844, "key_mse_loss_layer_029": 0.083496, "key_mse_loss_layer_030": 0.087402, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.049399, "kv_vq_loss": 0.00038, "learning_rate": 0.001, "loss": 0.049771, "step": 19600, "value_mse_loss_layer_000": 0.000353, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007202, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015747, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014709, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015259, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.025757, "value_mse_loss_layer_024": 0.029053, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.055908, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000161, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000215, "vq_loss_layer_012": 0.000332, "vq_loss_layer_013": 0.00029, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000343, "vq_loss_layer_017": 0.000309, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000168, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000278, "vq_loss_layer_023": 0.000261, "vq_loss_layer_024": 0.000284, "vq_loss_layer_025": 0.000336, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.00071, "vq_loss_layer_029": 0.00087, "vq_loss_layer_030": 0.001877, "vq_loss_layer_031": 0.002579 }, { "ce_loss": 2.336972, "epoch": 0.01961, "grad_norm": 0.001068652025423944, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.053955, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.05249, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084473, "key_mse_loss_layer_010": 0.09668, "key_mse_loss_layer_011": 0.094727, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.096191, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.09082, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.070801, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.04939, "kv_vq_loss": 0.000374, "learning_rate": 0.001, "loss": 0.049759, "step": 19610, "value_mse_loss_layer_000": 0.000355, "value_mse_loss_layer_001": 0.000965, "value_mse_loss_layer_002": 0.003845, "value_mse_loss_layer_003": 0.006927, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006134, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008423, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.023804, "value_mse_loss_layer_023": 0.027222, "value_mse_loss_layer_024": 0.030029, "value_mse_loss_layer_025": 0.034424, "value_mse_loss_layer_026": 0.032227, "value_mse_loss_layer_027": 0.040771, "value_mse_loss_layer_028": 0.047363, "value_mse_loss_layer_029": 0.053711, "value_mse_loss_layer_030": 0.056396, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9e-05, "vq_loss_layer_007": 0.000138, "vq_loss_layer_008": 0.000131, "vq_loss_layer_009": 0.000176, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000171, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000296, "vq_loss_layer_015": 0.000385, "vq_loss_layer_016": 0.00029, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000158, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000265, "vq_loss_layer_022": 0.000197, "vq_loss_layer_023": 0.00022, "vq_loss_layer_024": 0.000206, "vq_loss_layer_025": 0.000252, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000448, "vq_loss_layer_028": 0.000713, "vq_loss_layer_029": 0.000877, "vq_loss_layer_030": 0.00177, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.316998, "epoch": 0.01962, "grad_norm": 0.0011373901506885886, "key_mse_loss_layer_000": 0.002808, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.045166, "key_mse_loss_layer_004": 0.042725, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.092773, "key_mse_loss_layer_009": 0.100586, "key_mse_loss_layer_010": 0.111816, "key_mse_loss_layer_011": 0.107422, "key_mse_loss_layer_012": 0.080078, "key_mse_loss_layer_013": 0.144531, "key_mse_loss_layer_014": 0.140625, "key_mse_loss_layer_015": 0.126953, "key_mse_loss_layer_016": 0.121582, "key_mse_loss_layer_017": 0.120605, "key_mse_loss_layer_018": 0.126953, "key_mse_loss_layer_019": 0.101562, "key_mse_loss_layer_020": 0.118164, "key_mse_loss_layer_021": 0.113281, "key_mse_loss_layer_022": 0.117676, "key_mse_loss_layer_023": 0.11377, "key_mse_loss_layer_024": 0.089355, "key_mse_loss_layer_025": 0.083984, "key_mse_loss_layer_026": 0.098633, "key_mse_loss_layer_027": 0.095703, "key_mse_loss_layer_028": 0.104492, "key_mse_loss_layer_029": 0.091309, "key_mse_loss_layer_030": 0.102051, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049539, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.04993, "step": 19620, "value_mse_loss_layer_000": 0.000349, "value_mse_loss_layer_001": 0.000946, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.007202, "value_mse_loss_layer_005": 0.006622, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.016724, "value_mse_loss_layer_016": 0.013489, "value_mse_loss_layer_017": 0.017212, "value_mse_loss_layer_018": 0.014526, "value_mse_loss_layer_019": 0.017212, "value_mse_loss_layer_020": 0.019409, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.021362, "value_mse_loss_layer_023": 0.023071, "value_mse_loss_layer_024": 0.026001, "value_mse_loss_layer_025": 0.03125, "value_mse_loss_layer_026": 0.028198, "value_mse_loss_layer_027": 0.036377, "value_mse_loss_layer_028": 0.040771, "value_mse_loss_layer_029": 0.045898, "value_mse_loss_layer_030": 0.050781, "value_mse_loss_layer_031": 0.045654, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 6.1e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000192, "vq_loss_layer_009": 0.000223, "vq_loss_layer_010": 0.000205, "vq_loss_layer_011": 0.000208, "vq_loss_layer_012": 0.000353, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000401, "vq_loss_layer_015": 0.000343, "vq_loss_layer_016": 0.00033, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000174, "vq_loss_layer_020": 0.000238, "vq_loss_layer_021": 0.000431, "vq_loss_layer_022": 0.000324, "vq_loss_layer_023": 0.000334, "vq_loss_layer_024": 0.00038, "vq_loss_layer_025": 0.000443, "vq_loss_layer_026": 0.000561, "vq_loss_layer_027": 0.000546, "vq_loss_layer_028": 0.000954, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003281 }, { "ce_loss": 2.299092, "epoch": 0.01963, "grad_norm": 0.00100104755256325, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.043945, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.062988, "key_mse_loss_layer_007": 0.072266, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.083984, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.109863, "key_mse_loss_layer_014": 0.10791, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.088379, "key_mse_loss_layer_017": 0.093262, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.094727, "key_mse_loss_layer_024": 0.079102, "key_mse_loss_layer_025": 0.075195, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.086914, "key_mse_loss_layer_028": 0.091309, "key_mse_loss_layer_029": 0.089355, "key_mse_loss_layer_030": 0.081543, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.049271, "kv_vq_loss": 0.000374, "learning_rate": 0.001, "loss": 0.049619, "step": 19630, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.003845, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013977, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.016479, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.014954, "value_mse_loss_layer_017": 0.018677, "value_mse_loss_layer_018": 0.016235, "value_mse_loss_layer_019": 0.019531, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.02356, "value_mse_loss_layer_022": 0.024292, "value_mse_loss_layer_023": 0.029297, "value_mse_loss_layer_024": 0.034424, "value_mse_loss_layer_025": 0.037109, "value_mse_loss_layer_026": 0.036377, "value_mse_loss_layer_027": 0.045654, "value_mse_loss_layer_028": 0.05249, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.06543, "value_mse_loss_layer_031": 0.052734, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 4.9e-05, "vq_loss_layer_006": 9.1e-05, "vq_loss_layer_007": 0.000132, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.00018, "vq_loss_layer_010": 0.000163, "vq_loss_layer_011": 0.000188, "vq_loss_layer_012": 0.000288, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000317, "vq_loss_layer_015": 0.00033, "vq_loss_layer_016": 0.000284, "vq_loss_layer_017": 0.000252, "vq_loss_layer_018": 0.000158, "vq_loss_layer_019": 0.000137, "vq_loss_layer_020": 0.000128, "vq_loss_layer_021": 0.000237, "vq_loss_layer_022": 0.000171, "vq_loss_layer_023": 0.000197, "vq_loss_layer_024": 0.000208, "vq_loss_layer_025": 0.000241, "vq_loss_layer_026": 0.00037, "vq_loss_layer_027": 0.000404, "vq_loss_layer_028": 0.001015, "vq_loss_layer_029": 0.001076, "vq_loss_layer_030": 0.0019, "vq_loss_layer_031": 0.00322 }, { "ce_loss": 2.304569, "epoch": 0.01964, "grad_norm": 0.001150804222561419, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.09375, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.093262, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.069824, "kv_mse_loss": 0.049133, "kv_vq_loss": 0.000382, "learning_rate": 0.001, "loss": 0.049524, "step": 19640, "value_mse_loss_layer_000": 0.000366, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007202, "value_mse_loss_layer_004": 0.006683, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.047607, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.4e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000215, "vq_loss_layer_010": 0.000174, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.00032, "vq_loss_layer_013": 0.000273, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000315, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000156, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000212, "vq_loss_layer_023": 0.000222, "vq_loss_layer_024": 0.000225, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000446, "vq_loss_layer_027": 0.000534, "vq_loss_layer_028": 0.000641, "vq_loss_layer_029": 0.000965, "vq_loss_layer_030": 0.002167, "vq_loss_layer_031": 0.002747 }, { "ce_loss": 2.318835, "epoch": 0.01965, "grad_norm": 0.0011240564053878188, "key_mse_loss_layer_000": 0.004852, "key_mse_loss_layer_001": 0.011658, "key_mse_loss_layer_002": 0.058838, "key_mse_loss_layer_003": 0.052734, "key_mse_loss_layer_004": 0.055908, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.071289, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.087891, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.089844, "key_mse_loss_layer_020": 0.098633, "key_mse_loss_layer_021": 0.09375, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.07666, "key_mse_loss_layer_025": 0.074219, "key_mse_loss_layer_026": 0.084473, "key_mse_loss_layer_027": 0.088867, "key_mse_loss_layer_028": 0.091797, "key_mse_loss_layer_029": 0.090332, "key_mse_loss_layer_030": 0.092285, "key_mse_loss_layer_031": 0.078613, "kv_mse_loss": 0.049503, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.049902, "step": 19650, "value_mse_loss_layer_000": 0.000402, "value_mse_loss_layer_001": 0.00106, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006531, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014404, "value_mse_loss_layer_010": 0.012146, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013306, "value_mse_loss_layer_013": 0.014587, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018677, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.023438, "value_mse_loss_layer_022": 0.024414, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.030884, "value_mse_loss_layer_025": 0.035889, "value_mse_loss_layer_026": 0.032715, "value_mse_loss_layer_027": 0.043213, "value_mse_loss_layer_028": 0.046631, "value_mse_loss_layer_029": 0.056885, "value_mse_loss_layer_030": 0.063477, "value_mse_loss_layer_031": 0.050781, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000152, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000189, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000351, "vq_loss_layer_015": 0.000364, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000265, "vq_loss_layer_018": 0.000157, "vq_loss_layer_019": 0.000147, "vq_loss_layer_020": 0.000177, "vq_loss_layer_021": 0.000284, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000205, "vq_loss_layer_024": 0.000243, "vq_loss_layer_025": 0.000351, "vq_loss_layer_026": 0.000473, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.000759, "vq_loss_layer_029": 0.001144, "vq_loss_layer_030": 0.002472, "vq_loss_layer_031": 0.003128 }, { "ce_loss": 2.3484, "epoch": 0.01966, "grad_norm": 0.001077729044482112, "key_mse_loss_layer_000": 0.003281, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.05127, "key_mse_loss_layer_004": 0.053223, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.097168, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.109375, "key_mse_loss_layer_014": 0.106445, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.098633, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.092285, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.070312, "kv_mse_loss": 0.049203, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049606, "step": 19660, "value_mse_loss_layer_000": 0.000351, "value_mse_loss_layer_001": 0.000977, "value_mse_loss_layer_002": 0.003845, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.011719, "value_mse_loss_layer_011": 0.012268, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014771, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.023438, "value_mse_loss_layer_023": 0.026367, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.031738, "value_mse_loss_layer_027": 0.040283, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.058594, "value_mse_loss_layer_031": 0.047119, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000205, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000307, "vq_loss_layer_015": 0.000349, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000259, "vq_loss_layer_018": 0.000189, "vq_loss_layer_019": 0.000161, "vq_loss_layer_020": 0.000161, "vq_loss_layer_021": 0.000269, "vq_loss_layer_022": 0.000211, "vq_loss_layer_023": 0.00021, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000242, "vq_loss_layer_026": 0.000393, "vq_loss_layer_027": 0.000462, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000759, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.002762 }, { "ce_loss": 2.266016, "epoch": 0.01967, "grad_norm": 0.0010944193927571177, "key_mse_loss_layer_000": 0.003128, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.052734, "key_mse_loss_layer_005": 0.060059, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.089844, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.088379, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078125, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.080078, "key_mse_loss_layer_031": 0.066895, "kv_mse_loss": 0.049435, "kv_vq_loss": 0.000391, "learning_rate": 0.001, "loss": 0.049817, "step": 19670, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000977, "value_mse_loss_layer_002": 0.003891, "value_mse_loss_layer_003": 0.007172, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015381, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018433, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020752, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.028442, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030884, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000134, "vq_loss_layer_009": 0.000225, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000278, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.00029, "vq_loss_layer_017": 0.000277, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000145, "vq_loss_layer_020": 0.000191, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.0002, "vq_loss_layer_023": 0.000223, "vq_loss_layer_024": 0.000207, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000477, "vq_loss_layer_028": 0.000736, "vq_loss_layer_029": 0.000813, "vq_loss_layer_030": 0.001625, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.291852, "epoch": 0.01968, "grad_norm": 0.0011409703874960542, "key_mse_loss_layer_000": 0.002991, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.053467, "key_mse_loss_layer_005": 0.062988, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.088379, "key_mse_loss_layer_023": 0.085449, "key_mse_loss_layer_024": 0.066895, "key_mse_loss_layer_025": 0.066895, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.07373, "key_mse_loss_layer_028": 0.080566, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049429, "kv_vq_loss": 0.000377, "learning_rate": 0.001, "loss": 0.049802, "step": 19680, "value_mse_loss_layer_000": 0.000357, "value_mse_loss_layer_001": 0.000973, "value_mse_loss_layer_002": 0.003815, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006561, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.01532, "value_mse_loss_layer_010": 0.012512, "value_mse_loss_layer_011": 0.013184, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015991, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018921, "value_mse_loss_layer_016": 0.015137, "value_mse_loss_layer_017": 0.019165, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.022827, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.027466, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.029053, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.041992, "value_mse_loss_layer_029": 0.04834, "value_mse_loss_layer_030": 0.051758, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.9e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000139, "vq_loss_layer_009": 0.00019, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000328, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000326, "vq_loss_layer_015": 0.000381, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000332, "vq_loss_layer_018": 0.000175, "vq_loss_layer_019": 0.000176, "vq_loss_layer_020": 0.000209, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000236, "vq_loss_layer_024": 0.000196, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000353, "vq_loss_layer_027": 0.000406, "vq_loss_layer_028": 0.000595, "vq_loss_layer_029": 0.000801, "vq_loss_layer_030": 0.001488, "vq_loss_layer_031": 0.00238 }, { "ce_loss": 2.294736, "epoch": 0.01969, "grad_norm": 0.0010111466981470585, "key_mse_loss_layer_000": 0.003525, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.047119, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.057617, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.07373, "key_mse_loss_layer_008": 0.081543, "key_mse_loss_layer_009": 0.084961, "key_mse_loss_layer_010": 0.096191, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.069824, "key_mse_loss_layer_013": 0.108887, "key_mse_loss_layer_014": 0.105469, "key_mse_loss_layer_015": 0.095215, "key_mse_loss_layer_016": 0.087402, "key_mse_loss_layer_017": 0.090332, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.072266, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080566, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.077637, "key_mse_loss_layer_031": 0.0625, "kv_mse_loss": 0.049472, "kv_vq_loss": 0.000398, "learning_rate": 0.001, "loss": 0.049869, "step": 19690, "value_mse_loss_layer_000": 0.000368, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007568, "value_mse_loss_layer_004": 0.007233, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014648, "value_mse_loss_layer_014": 0.015381, "value_mse_loss_layer_015": 0.016968, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.016846, "value_mse_loss_layer_018": 0.015564, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.021362, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.043701, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.058105, "value_mse_loss_layer_031": 0.048828, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000164, "vq_loss_layer_009": 0.000198, "vq_loss_layer_010": 0.000182, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.00029, "vq_loss_layer_013": 0.000248, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000328, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000184, "vq_loss_layer_019": 0.000195, "vq_loss_layer_020": 0.000176, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000221, "vq_loss_layer_024": 0.000288, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.000431, "vq_loss_layer_027": 0.000519, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.00116, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.003342 }, { "ce_loss": 2.320377, "epoch": 0.0197, "grad_norm": 0.0011448130244389176, "key_mse_loss_layer_000": 0.003265, "key_mse_loss_layer_001": 0.010376, "key_mse_loss_layer_002": 0.057373, "key_mse_loss_layer_003": 0.052246, "key_mse_loss_layer_004": 0.057861, "key_mse_loss_layer_005": 0.061768, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.099609, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069824, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.083008, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.049673, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.050052, "step": 19700, "value_mse_loss_layer_000": 0.00036, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.00386, "value_mse_loss_layer_003": 0.007721, "value_mse_loss_layer_004": 0.006775, "value_mse_loss_layer_005": 0.006226, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015137, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.045166, "value_mse_loss_layer_029": 0.052246, "value_mse_loss_layer_030": 0.056885, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 2.7e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000157, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000167, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000282, "vq_loss_layer_014": 0.000336, "vq_loss_layer_015": 0.000372, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000284, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000142, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.00029, "vq_loss_layer_022": 0.000199, "vq_loss_layer_023": 0.000226, "vq_loss_layer_024": 0.000208, "vq_loss_layer_025": 0.000275, "vq_loss_layer_026": 0.000416, "vq_loss_layer_027": 0.000511, "vq_loss_layer_028": 0.000656, "vq_loss_layer_029": 0.000942, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.288615, "epoch": 0.01971, "grad_norm": 0.0011133570224046707, "key_mse_loss_layer_000": 0.002884, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.053223, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.097656, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.115723, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.068359, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.079102, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.062256, "kv_mse_loss": 0.04938, "kv_vq_loss": 0.000381, "learning_rate": 0.001, "loss": 0.049741, "step": 19710, "value_mse_loss_layer_000": 0.000349, "value_mse_loss_layer_001": 0.000946, "value_mse_loss_layer_002": 0.003754, "value_mse_loss_layer_003": 0.00708, "value_mse_loss_layer_004": 0.006592, "value_mse_loss_layer_005": 0.006287, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.015137, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012695, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.025879, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.030029, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.043457, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.052979, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000207, "vq_loss_layer_010": 0.000161, "vq_loss_layer_011": 0.000186, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.00038, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000307, "vq_loss_layer_018": 0.000157, "vq_loss_layer_019": 0.000152, "vq_loss_layer_020": 0.000185, "vq_loss_layer_021": 0.000332, "vq_loss_layer_022": 0.000217, "vq_loss_layer_023": 0.000223, "vq_loss_layer_024": 0.00022, "vq_loss_layer_025": 0.000263, "vq_loss_layer_026": 0.000368, "vq_loss_layer_027": 0.000406, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.001747, "vq_loss_layer_031": 0.002472 }, { "ce_loss": 2.30921, "epoch": 0.01972, "grad_norm": 0.0011099508265033364, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.010742, "key_mse_loss_layer_002": 0.061035, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.068359, "key_mse_loss_layer_007": 0.078125, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.095703, "key_mse_loss_layer_010": 0.106934, "key_mse_loss_layer_011": 0.105469, "key_mse_loss_layer_012": 0.081543, "key_mse_loss_layer_013": 0.129883, "key_mse_loss_layer_014": 0.125977, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.101074, "key_mse_loss_layer_017": 0.102539, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.087891, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.081055, "key_mse_loss_layer_031": 0.060059, "kv_mse_loss": 0.049771, "kv_vq_loss": 0.000394, "learning_rate": 0.001, "loss": 0.050168, "step": 19720, "value_mse_loss_layer_000": 0.000364, "value_mse_loss_layer_001": 0.001015, "value_mse_loss_layer_002": 0.004028, "value_mse_loss_layer_003": 0.00766, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008545, "value_mse_loss_layer_007": 0.009216, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.015991, "value_mse_loss_layer_010": 0.012878, "value_mse_loss_layer_011": 0.013611, "value_mse_loss_layer_012": 0.01532, "value_mse_loss_layer_013": 0.016724, "value_mse_loss_layer_014": 0.016968, "value_mse_loss_layer_015": 0.018555, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015198, "value_mse_loss_layer_019": 0.0177, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.021484, "value_mse_loss_layer_022": 0.021851, "value_mse_loss_layer_023": 0.02356, "value_mse_loss_layer_024": 0.026001, "value_mse_loss_layer_025": 0.031006, "value_mse_loss_layer_026": 0.029053, "value_mse_loss_layer_027": 0.036865, "value_mse_loss_layer_028": 0.041016, "value_mse_loss_layer_029": 0.049072, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1.2e-05, "vq_loss_layer_002": 1.4e-05, "vq_loss_layer_003": 3.1e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 6.9e-05, "vq_loss_layer_006": 0.000119, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.00018, "vq_loss_layer_009": 0.000294, "vq_loss_layer_010": 0.000238, "vq_loss_layer_011": 0.000235, "vq_loss_layer_012": 0.00037, "vq_loss_layer_013": 0.000336, "vq_loss_layer_014": 0.000433, "vq_loss_layer_015": 0.000465, "vq_loss_layer_016": 0.000479, "vq_loss_layer_017": 0.000383, "vq_loss_layer_018": 0.000208, "vq_loss_layer_019": 0.000204, "vq_loss_layer_020": 0.000259, "vq_loss_layer_021": 0.000431, "vq_loss_layer_022": 0.00036, "vq_loss_layer_023": 0.000357, "vq_loss_layer_024": 0.000381, "vq_loss_layer_025": 0.000553, "vq_loss_layer_026": 0.000694, "vq_loss_layer_027": 0.00079, "vq_loss_layer_028": 0.001076, "vq_loss_layer_029": 0.001091, "vq_loss_layer_030": 0.002747, "vq_loss_layer_031": 0.004028 }, { "ce_loss": 2.282072, "epoch": 0.01973, "grad_norm": 0.001061954302713275, "key_mse_loss_layer_000": 0.002869, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.054443, "key_mse_loss_layer_003": 0.048584, "key_mse_loss_layer_004": 0.049072, "key_mse_loss_layer_005": 0.058838, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.117188, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.101074, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.089355, "key_mse_loss_layer_020": 0.099121, "key_mse_loss_layer_021": 0.095215, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.09375, "key_mse_loss_layer_024": 0.07373, "key_mse_loss_layer_025": 0.071289, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087891, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049292, "kv_vq_loss": 0.000378, "learning_rate": 0.001, "loss": 0.049652, "step": 19730, "value_mse_loss_layer_000": 0.000351, "value_mse_loss_layer_001": 0.000961, "value_mse_loss_layer_002": 0.003815, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006836, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.011719, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013306, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015442, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.013733, "value_mse_loss_layer_017": 0.017334, "value_mse_loss_layer_018": 0.015015, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019287, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.027832, "value_mse_loss_layer_025": 0.032227, "value_mse_loss_layer_026": 0.028931, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.047852, "value_mse_loss_layer_030": 0.05249, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.00014, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000199, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000366, "vq_loss_layer_016": 0.000286, "vq_loss_layer_017": 0.000256, "vq_loss_layer_018": 0.000169, "vq_loss_layer_019": 0.000149, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000311, "vq_loss_layer_022": 0.000226, "vq_loss_layer_023": 0.00024, "vq_loss_layer_024": 0.000213, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000374, "vq_loss_layer_027": 0.000402, "vq_loss_layer_028": 0.000751, "vq_loss_layer_029": 0.000729, "vq_loss_layer_030": 0.001656, "vq_loss_layer_031": 0.00264 }, { "ce_loss": 2.340791, "epoch": 0.01974, "grad_norm": 0.0011089438339695334, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.052246, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.091797, "key_mse_loss_layer_017": 0.096191, "key_mse_loss_layer_018": 0.099609, "key_mse_loss_layer_019": 0.086426, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.092285, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049515, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.049908, "step": 19740, "value_mse_loss_layer_000": 0.000357, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.003845, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006683, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012024, "value_mse_loss_layer_011": 0.012939, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015625, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.018799, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.018555, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020264, "value_mse_loss_layer_021": 0.023315, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.028076, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.051025, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000153, "vq_loss_layer_008": 0.000148, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000168, "vq_loss_layer_011": 0.000206, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000286, "vq_loss_layer_014": 0.000364, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000351, "vq_loss_layer_018": 0.00017, "vq_loss_layer_019": 0.00016, "vq_loss_layer_020": 0.000216, "vq_loss_layer_021": 0.000351, "vq_loss_layer_022": 0.000243, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000226, "vq_loss_layer_025": 0.00028, "vq_loss_layer_026": 0.000448, "vq_loss_layer_027": 0.000492, "vq_loss_layer_028": 0.000771, "vq_loss_layer_029": 0.000858, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.32684, "epoch": 0.01975, "grad_norm": 0.001034649321809411, "key_mse_loss_layer_000": 0.003159, "key_mse_loss_layer_001": 0.010498, "key_mse_loss_layer_002": 0.056152, "key_mse_loss_layer_003": 0.049316, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.115234, "key_mse_loss_layer_014": 0.111816, "key_mse_loss_layer_015": 0.100098, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095215, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091797, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.084961, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.049539, "kv_vq_loss": 0.000392, "learning_rate": 0.001, "loss": 0.049939, "step": 19750, "value_mse_loss_layer_000": 0.000349, "value_mse_loss_layer_001": 0.000977, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006958, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013428, "value_mse_loss_layer_013": 0.014648, "value_mse_loss_layer_014": 0.015259, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.023071, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.033691, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.042969, "value_mse_loss_layer_029": 0.050537, "value_mse_loss_layer_030": 0.055664, "value_mse_loss_layer_031": 0.046387, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.7e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000153, "vq_loss_layer_009": 0.000203, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000196, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000244, "vq_loss_layer_014": 0.000313, "vq_loss_layer_015": 0.000326, "vq_loss_layer_016": 0.000305, "vq_loss_layer_017": 0.000259, "vq_loss_layer_018": 0.000159, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000169, "vq_loss_layer_021": 0.000277, "vq_loss_layer_022": 0.000223, "vq_loss_layer_023": 0.00023, "vq_loss_layer_024": 0.000203, "vq_loss_layer_025": 0.000282, "vq_loss_layer_026": 0.000402, "vq_loss_layer_027": 0.00046, "vq_loss_layer_028": 0.000675, "vq_loss_layer_029": 0.000835, "vq_loss_layer_030": 0.001709, "vq_loss_layer_031": 0.002747 }, { "ce_loss": 2.31422, "epoch": 0.01976, "grad_norm": 0.0010686952155083418, "key_mse_loss_layer_000": 0.002762, "key_mse_loss_layer_001": 0.009644, "key_mse_loss_layer_002": 0.052002, "key_mse_loss_layer_003": 0.044434, "key_mse_loss_layer_004": 0.04541, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.063477, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.098633, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.121582, "key_mse_loss_layer_014": 0.118164, "key_mse_loss_layer_015": 0.10498, "key_mse_loss_layer_016": 0.097656, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.10498, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.097656, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.070312, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.07666, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.077148, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049243, "kv_vq_loss": 0.000377, "learning_rate": 0.001, "loss": 0.0496, "step": 19760, "value_mse_loss_layer_000": 0.000347, "value_mse_loss_layer_001": 0.000946, "value_mse_loss_layer_002": 0.003708, "value_mse_loss_layer_003": 0.007019, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006073, "value_mse_loss_layer_006": 0.007935, "value_mse_loss_layer_007": 0.008423, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.014648, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013367, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.017212, "value_mse_loss_layer_016": 0.01355, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015015, "value_mse_loss_layer_019": 0.017578, "value_mse_loss_layer_020": 0.019287, "value_mse_loss_layer_021": 0.021729, "value_mse_loss_layer_022": 0.020996, "value_mse_loss_layer_023": 0.024292, "value_mse_loss_layer_024": 0.026001, "value_mse_loss_layer_025": 0.03125, "value_mse_loss_layer_026": 0.027588, "value_mse_loss_layer_027": 0.035156, "value_mse_loss_layer_028": 0.039795, "value_mse_loss_layer_029": 0.045166, "value_mse_loss_layer_030": 0.050781, "value_mse_loss_layer_031": 0.044922, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000168, "vq_loss_layer_009": 0.000193, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000201, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.000343, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000288, "vq_loss_layer_017": 0.000267, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.00018, "vq_loss_layer_021": 0.000336, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000296, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000328, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000652, "vq_loss_layer_030": 0.001572, "vq_loss_layer_031": 0.002762 }, { "ce_loss": 2.288143, "epoch": 0.01977, "grad_norm": 0.0011605522595345974, "key_mse_loss_layer_000": 0.003006, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.083008, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.11084, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.095703, "key_mse_loss_layer_016": 0.086914, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091309, "key_mse_loss_layer_021": 0.087891, "key_mse_loss_layer_022": 0.087891, "key_mse_loss_layer_023": 0.085938, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.066406, "key_mse_loss_layer_026": 0.075195, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.081543, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.078125, "key_mse_loss_layer_031": 0.066406, "kv_mse_loss": 0.049399, "kv_vq_loss": 0.000378, "learning_rate": 0.001, "loss": 0.049765, "step": 19770, "value_mse_loss_layer_000": 0.000347, "value_mse_loss_layer_001": 0.000954, "value_mse_loss_layer_002": 0.003784, "value_mse_loss_layer_003": 0.007111, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006256, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.015259, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.01532, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.01532, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019897, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.022949, "value_mse_loss_layer_023": 0.026001, "value_mse_loss_layer_024": 0.029419, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.030273, "value_mse_loss_layer_027": 0.03833, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.049805, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 0.000103, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000141, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.000191, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000265, "vq_loss_layer_014": 0.000309, "vq_loss_layer_015": 0.000391, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.00017, "vq_loss_layer_019": 0.000148, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000303, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000228, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000241, "vq_loss_layer_026": 0.000372, "vq_loss_layer_027": 0.000433, "vq_loss_layer_028": 0.000706, "vq_loss_layer_029": 0.000774, "vq_loss_layer_030": 0.001724, "vq_loss_layer_031": 0.002548 }, { "ce_loss": 2.251214, "epoch": 0.01978, "grad_norm": 0.0013327670749276876, "key_mse_loss_layer_000": 0.004761, "key_mse_loss_layer_001": 0.01178, "key_mse_loss_layer_002": 0.05835, "key_mse_loss_layer_003": 0.04834, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.057861, "key_mse_loss_layer_006": 0.06543, "key_mse_loss_layer_007": 0.073242, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.097656, "key_mse_loss_layer_011": 0.095703, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109375, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094727, "key_mse_loss_layer_018": 0.102539, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.096191, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.094727, "key_mse_loss_layer_023": 0.095703, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.07666, "key_mse_loss_layer_026": 0.085938, "key_mse_loss_layer_027": 0.089355, "key_mse_loss_layer_028": 0.092285, "key_mse_loss_layer_029": 0.091797, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.049713, "kv_vq_loss": 0.000396, "learning_rate": 0.001, "loss": 0.050113, "step": 19780, "value_mse_loss_layer_000": 0.000401, "value_mse_loss_layer_001": 0.001045, "value_mse_loss_layer_002": 0.004059, "value_mse_loss_layer_003": 0.007812, "value_mse_loss_layer_004": 0.007324, "value_mse_loss_layer_005": 0.006592, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012146, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015015, "value_mse_loss_layer_014": 0.015625, "value_mse_loss_layer_015": 0.01709, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.016602, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.021118, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.028687, "value_mse_loss_layer_024": 0.033447, "value_mse_loss_layer_025": 0.037842, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.047852, "value_mse_loss_layer_028": 0.051514, "value_mse_loss_layer_029": 0.061279, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.055908, "vq_loss_layer_000": 5e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1e-05, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.000179, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000256, "vq_loss_layer_018": 0.000193, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.000155, "vq_loss_layer_021": 0.000241, "vq_loss_layer_022": 0.000179, "vq_loss_layer_023": 0.000211, "vq_loss_layer_024": 0.00021, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.00042, "vq_loss_layer_027": 0.0005, "vq_loss_layer_028": 0.000961, "vq_loss_layer_029": 0.00116, "vq_loss_layer_030": 0.002014, "vq_loss_layer_031": 0.003799 }, { "ce_loss": 2.330929, "epoch": 0.01979, "grad_norm": 0.0010231606429442763, "key_mse_loss_layer_000": 0.002975, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.052979, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.112793, "key_mse_loss_layer_015": 0.101562, "key_mse_loss_layer_016": 0.093262, "key_mse_loss_layer_017": 0.09668, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.095215, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.070801, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.078125, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049619, "kv_vq_loss": 0.000385, "learning_rate": 0.001, "loss": 0.050015, "step": 19790, "value_mse_loss_layer_000": 0.000343, "value_mse_loss_layer_001": 0.00095, "value_mse_loss_layer_002": 0.003815, "value_mse_loss_layer_003": 0.00708, "value_mse_loss_layer_004": 0.006653, "value_mse_loss_layer_005": 0.006378, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.015015, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.022461, "value_mse_loss_layer_023": 0.025513, "value_mse_loss_layer_024": 0.027954, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.029297, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.041748, "value_mse_loss_layer_029": 0.048584, "value_mse_loss_layer_030": 0.053467, "value_mse_loss_layer_031": 0.045166, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.6e-05, "vq_loss_layer_006": 9.7e-05, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.000144, "vq_loss_layer_009": 0.000216, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000192, "vq_loss_layer_012": 0.000307, "vq_loss_layer_013": 0.000298, "vq_loss_layer_014": 0.000349, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000309, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000178, "vq_loss_layer_019": 0.000135, "vq_loss_layer_020": 0.000184, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000259, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000427, "vq_loss_layer_027": 0.000473, "vq_loss_layer_028": 0.000629, "vq_loss_layer_029": 0.000744, "vq_loss_layer_030": 0.00177, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.302656, "epoch": 0.0198, "grad_norm": 0.001201901468448341, "key_mse_loss_layer_000": 0.002686, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.056885, "key_mse_loss_layer_003": 0.044678, "key_mse_loss_layer_004": 0.042236, "key_mse_loss_layer_005": 0.059326, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.091309, "key_mse_loss_layer_009": 0.098633, "key_mse_loss_layer_010": 0.111816, "key_mse_loss_layer_011": 0.106445, "key_mse_loss_layer_012": 0.080566, "key_mse_loss_layer_013": 0.146484, "key_mse_loss_layer_014": 0.142578, "key_mse_loss_layer_015": 0.12793, "key_mse_loss_layer_016": 0.124512, "key_mse_loss_layer_017": 0.123535, "key_mse_loss_layer_018": 0.12793, "key_mse_loss_layer_019": 0.100098, "key_mse_loss_layer_020": 0.116699, "key_mse_loss_layer_021": 0.109863, "key_mse_loss_layer_022": 0.117676, "key_mse_loss_layer_023": 0.112793, "key_mse_loss_layer_024": 0.087402, "key_mse_loss_layer_025": 0.082031, "key_mse_loss_layer_026": 0.097656, "key_mse_loss_layer_027": 0.092285, "key_mse_loss_layer_028": 0.103027, "key_mse_loss_layer_029": 0.086426, "key_mse_loss_layer_030": 0.099609, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049417, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.049811, "step": 19800, "value_mse_loss_layer_000": 0.000357, "value_mse_loss_layer_001": 0.000957, "value_mse_loss_layer_002": 0.003937, "value_mse_loss_layer_003": 0.007507, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.008728, "value_mse_loss_layer_008": 0.010437, "value_mse_loss_layer_009": 0.014526, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.013916, "value_mse_loss_layer_013": 0.014648, "value_mse_loss_layer_014": 0.015137, "value_mse_loss_layer_015": 0.016479, "value_mse_loss_layer_016": 0.013, "value_mse_loss_layer_017": 0.017212, "value_mse_loss_layer_018": 0.014099, "value_mse_loss_layer_019": 0.016846, "value_mse_loss_layer_020": 0.018799, "value_mse_loss_layer_021": 0.020508, "value_mse_loss_layer_022": 0.019775, "value_mse_loss_layer_023": 0.021606, "value_mse_loss_layer_024": 0.023804, "value_mse_loss_layer_025": 0.029419, "value_mse_loss_layer_026": 0.025269, "value_mse_loss_layer_027": 0.032471, "value_mse_loss_layer_028": 0.037109, "value_mse_loss_layer_029": 0.040771, "value_mse_loss_layer_030": 0.046875, "value_mse_loss_layer_031": 0.044922, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1e-05, "vq_loss_layer_002": 1.5e-05, "vq_loss_layer_003": 3e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 0.000113, "vq_loss_layer_007": 0.000146, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000208, "vq_loss_layer_010": 0.000203, "vq_loss_layer_011": 0.000226, "vq_loss_layer_012": 0.000359, "vq_loss_layer_013": 0.000256, "vq_loss_layer_014": 0.000395, "vq_loss_layer_015": 0.000362, "vq_loss_layer_016": 0.000301, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000188, "vq_loss_layer_019": 0.000158, "vq_loss_layer_020": 0.000235, "vq_loss_layer_021": 0.000387, "vq_loss_layer_022": 0.000309, "vq_loss_layer_023": 0.000351, "vq_loss_layer_024": 0.000328, "vq_loss_layer_025": 0.000534, "vq_loss_layer_026": 0.0005, "vq_loss_layer_027": 0.000483, "vq_loss_layer_028": 0.001076, "vq_loss_layer_029": 0.000641, "vq_loss_layer_030": 0.001999, "vq_loss_layer_031": 0.003479 }, { "ce_loss": 2.310392, "epoch": 0.01981, "grad_norm": 0.0011254562996327877, "key_mse_loss_layer_000": 0.003403, "key_mse_loss_layer_001": 0.010193, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048828, "key_mse_loss_layer_004": 0.051758, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101074, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.07373, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.11377, "key_mse_loss_layer_015": 0.102051, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.097168, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.095703, "key_mse_loss_layer_021": 0.091309, "key_mse_loss_layer_022": 0.092773, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.085449, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.04953, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.049908, "step": 19810, "value_mse_loss_layer_000": 0.000349, "value_mse_loss_layer_001": 0.000969, "value_mse_loss_layer_002": 0.003815, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006744, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.011963, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013794, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.0177, "value_mse_loss_layer_016": 0.014404, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028564, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.029785, "value_mse_loss_layer_027": 0.038086, "value_mse_loss_layer_028": 0.042725, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054199, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.9e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000107, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000202, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000334, "vq_loss_layer_015": 0.00033, "vq_loss_layer_016": 0.000332, "vq_loss_layer_017": 0.000275, "vq_loss_layer_018": 0.000191, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.000164, "vq_loss_layer_021": 0.000292, "vq_loss_layer_022": 0.000215, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000443, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000618, "vq_loss_layer_029": 0.000763, "vq_loss_layer_030": 0.001785, "vq_loss_layer_031": 0.002747 }, { "ce_loss": 2.334298, "epoch": 0.01982, "grad_norm": 0.0010451037669554353, "key_mse_loss_layer_000": 0.003662, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.051514, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083984, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.114258, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.100586, "key_mse_loss_layer_016": 0.092773, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.102051, "key_mse_loss_layer_019": 0.088867, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.094238, "key_mse_loss_layer_023": 0.092773, "key_mse_loss_layer_024": 0.074707, "key_mse_loss_layer_025": 0.072266, "key_mse_loss_layer_026": 0.081543, "key_mse_loss_layer_027": 0.083496, "key_mse_loss_layer_028": 0.088867, "key_mse_loss_layer_029": 0.085449, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.049524, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.049899, "step": 19820, "value_mse_loss_layer_000": 0.000368, "value_mse_loss_layer_001": 0.000992, "value_mse_loss_layer_002": 0.003891, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006195, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008362, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.014465, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012817, "value_mse_loss_layer_012": 0.013489, "value_mse_loss_layer_013": 0.014526, "value_mse_loss_layer_014": 0.01532, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014038, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015625, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.020508, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.029785, "value_mse_loss_layer_025": 0.034912, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.041016, "value_mse_loss_layer_028": 0.045898, "value_mse_loss_layer_029": 0.052002, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.8e-05, "vq_loss_layer_004": 5e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000138, "vq_loss_layer_008": 0.000138, "vq_loss_layer_009": 0.000181, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000202, "vq_loss_layer_012": 0.000301, "vq_loss_layer_013": 0.000243, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000336, "vq_loss_layer_016": 0.000294, "vq_loss_layer_017": 0.000259, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.000169, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000204, "vq_loss_layer_024": 0.000193, "vq_loss_layer_025": 0.000239, "vq_loss_layer_026": 0.00038, "vq_loss_layer_027": 0.000446, "vq_loss_layer_028": 0.00066, "vq_loss_layer_029": 0.000832, "vq_loss_layer_030": 0.001617, "vq_loss_layer_031": 0.002365 }, { "ce_loss": 2.308925, "epoch": 0.01983, "grad_norm": 0.0014278049347922206, "key_mse_loss_layer_000": 0.00354, "key_mse_loss_layer_001": 0.010803, "key_mse_loss_layer_002": 0.055664, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.050293, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.099121, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.071777, "key_mse_loss_layer_013": 0.112305, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.071289, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.081055, "key_mse_loss_layer_028": 0.086914, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.088379, "key_mse_loss_layer_031": 0.075684, "kv_mse_loss": 0.049664, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.050046, "step": 19830, "value_mse_loss_layer_000": 0.000368, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010742, "value_mse_loss_layer_009": 0.014832, "value_mse_loss_layer_010": 0.011719, "value_mse_loss_layer_011": 0.012451, "value_mse_loss_layer_012": 0.013367, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.015442, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019409, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026611, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044678, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.047852, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 3e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.3e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 0.000111, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000158, "vq_loss_layer_009": 0.000196, "vq_loss_layer_010": 0.000164, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000294, "vq_loss_layer_013": 0.000288, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000282, "vq_loss_layer_018": 0.000187, "vq_loss_layer_019": 0.000159, "vq_loss_layer_020": 0.000178, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000242, "vq_loss_layer_023": 0.000254, "vq_loss_layer_024": 0.000257, "vq_loss_layer_025": 0.000315, "vq_loss_layer_026": 0.000507, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000732, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.003296 }, { "ce_loss": 2.328938, "epoch": 0.01984, "grad_norm": 0.0010111337760463357, "key_mse_loss_layer_000": 0.002899, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.047607, "key_mse_loss_layer_004": 0.047852, "key_mse_loss_layer_005": 0.05835, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.08252, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.071289, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.11084, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.091309, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100098, "key_mse_loss_layer_019": 0.085449, "key_mse_loss_layer_020": 0.094238, "key_mse_loss_layer_021": 0.088867, "key_mse_loss_layer_022": 0.090332, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.077637, "key_mse_loss_layer_027": 0.075684, "key_mse_loss_layer_028": 0.083496, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.07959, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049484, "kv_vq_loss": 0.000389, "learning_rate": 0.001, "loss": 0.049872, "step": 19840, "value_mse_loss_layer_000": 0.000351, "value_mse_loss_layer_001": 0.000965, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007355, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010681, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014343, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.018066, "value_mse_loss_layer_020": 0.02002, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.022217, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.028076, "value_mse_loss_layer_025": 0.032715, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.042236, "value_mse_loss_layer_029": 0.048584, "value_mse_loss_layer_030": 0.052246, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.1e-05, "vq_loss_layer_005": 5.4e-05, "vq_loss_layer_006": 9.5e-05, "vq_loss_layer_007": 0.000142, "vq_loss_layer_008": 0.000149, "vq_loss_layer_009": 0.000206, "vq_loss_layer_010": 0.000162, "vq_loss_layer_011": 0.000174, "vq_loss_layer_012": 0.000296, "vq_loss_layer_013": 0.000269, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.000278, "vq_loss_layer_018": 0.000168, "vq_loss_layer_019": 0.000151, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000222, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.000236, "vq_loss_layer_025": 0.000296, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000458, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.000732, "vq_loss_layer_030": 0.001984, "vq_loss_layer_031": 0.002869 }, { "ce_loss": 2.292154, "epoch": 0.01985, "grad_norm": 0.0011570032220333815, "key_mse_loss_layer_000": 0.003418, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.056641, "key_mse_loss_layer_003": 0.050537, "key_mse_loss_layer_004": 0.056641, "key_mse_loss_layer_005": 0.061279, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.085938, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070801, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.097168, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092773, "key_mse_loss_layer_018": 0.097168, "key_mse_loss_layer_019": 0.084961, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087402, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.078613, "key_mse_loss_layer_028": 0.084961, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.083496, "key_mse_loss_layer_031": 0.071289, "kv_mse_loss": 0.049335, "kv_vq_loss": 0.000387, "learning_rate": 0.001, "loss": 0.049716, "step": 19850, "value_mse_loss_layer_000": 0.000364, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.003815, "value_mse_loss_layer_003": 0.007233, "value_mse_loss_layer_004": 0.006592, "value_mse_loss_layer_005": 0.006042, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008362, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.014709, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.012329, "value_mse_loss_layer_012": 0.013367, "value_mse_loss_layer_013": 0.014832, "value_mse_loss_layer_014": 0.015503, "value_mse_loss_layer_015": 0.018066, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017944, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018188, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.031128, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.044922, "value_mse_loss_layer_029": 0.052734, "value_mse_loss_layer_030": 0.056641, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.8e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000105, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000145, "vq_loss_layer_009": 0.000195, "vq_loss_layer_010": 0.000159, "vq_loss_layer_011": 0.00018, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000322, "vq_loss_layer_015": 0.00036, "vq_loss_layer_016": 0.000296, "vq_loss_layer_017": 0.000296, "vq_loss_layer_018": 0.000154, "vq_loss_layer_019": 0.000139, "vq_loss_layer_020": 0.00016, "vq_loss_layer_021": 0.000301, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000209, "vq_loss_layer_024": 0.000206, "vq_loss_layer_025": 0.000265, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000504, "vq_loss_layer_028": 0.000664, "vq_loss_layer_029": 0.000797, "vq_loss_layer_030": 0.002121, "vq_loss_layer_031": 0.002457 }, { "ce_loss": 2.249171, "epoch": 0.01986, "grad_norm": 0.0011090326588600874, "key_mse_loss_layer_000": 0.002762, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.053711, "key_mse_loss_layer_003": 0.046875, "key_mse_loss_layer_004": 0.049561, "key_mse_loss_layer_005": 0.061035, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.088867, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.076172, "key_mse_loss_layer_013": 0.120605, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.104004, "key_mse_loss_layer_016": 0.094727, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.101562, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.092285, "key_mse_loss_layer_023": 0.089355, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.078613, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.084473, "key_mse_loss_layer_029": 0.080566, "key_mse_loss_layer_030": 0.079102, "key_mse_loss_layer_031": 0.062988, "kv_mse_loss": 0.049847, "kv_vq_loss": 0.000386, "learning_rate": 0.001, "loss": 0.050232, "step": 19860, "value_mse_loss_layer_000": 0.000347, "value_mse_loss_layer_001": 0.000957, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007202, "value_mse_loss_layer_004": 0.006805, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.008423, "value_mse_loss_layer_007": 0.009094, "value_mse_loss_layer_008": 0.011108, "value_mse_loss_layer_009": 0.015625, "value_mse_loss_layer_010": 0.012756, "value_mse_loss_layer_011": 0.013428, "value_mse_loss_layer_012": 0.014893, "value_mse_loss_layer_013": 0.016357, "value_mse_loss_layer_014": 0.016724, "value_mse_loss_layer_015": 0.019043, "value_mse_loss_layer_016": 0.015076, "value_mse_loss_layer_017": 0.019287, "value_mse_loss_layer_018": 0.015503, "value_mse_loss_layer_019": 0.019043, "value_mse_loss_layer_020": 0.020996, "value_mse_loss_layer_021": 0.023193, "value_mse_loss_layer_022": 0.023926, "value_mse_loss_layer_023": 0.026855, "value_mse_loss_layer_024": 0.02832, "value_mse_loss_layer_025": 0.03418, "value_mse_loss_layer_026": 0.030762, "value_mse_loss_layer_027": 0.039062, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.050293, "value_mse_loss_layer_030": 0.054688, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000156, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000186, "vq_loss_layer_011": 0.000205, "vq_loss_layer_012": 0.000351, "vq_loss_layer_013": 0.000296, "vq_loss_layer_014": 0.000353, "vq_loss_layer_015": 0.000383, "vq_loss_layer_016": 0.00032, "vq_loss_layer_017": 0.000326, "vq_loss_layer_018": 0.000183, "vq_loss_layer_019": 0.000162, "vq_loss_layer_020": 0.00022, "vq_loss_layer_021": 0.000338, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000271, "vq_loss_layer_024": 0.000237, "vq_loss_layer_025": 0.000278, "vq_loss_layer_026": 0.000406, "vq_loss_layer_027": 0.000416, "vq_loss_layer_028": 0.000763, "vq_loss_layer_029": 0.000767, "vq_loss_layer_030": 0.001587, "vq_loss_layer_031": 0.002579 }, { "ce_loss": 2.285728, "epoch": 0.01987, "grad_norm": 0.001207086257636547, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.012024, "key_mse_loss_layer_002": 0.069336, "key_mse_loss_layer_003": 0.053955, "key_mse_loss_layer_004": 0.050781, "key_mse_loss_layer_005": 0.06543, "key_mse_loss_layer_006": 0.072266, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.088379, "key_mse_loss_layer_009": 0.089844, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.105957, "key_mse_loss_layer_016": 0.100586, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.11377, "key_mse_loss_layer_019": 0.099609, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.099609, "key_mse_loss_layer_022": 0.110352, "key_mse_loss_layer_023": 0.109863, "key_mse_loss_layer_024": 0.091797, "key_mse_loss_layer_025": 0.086914, "key_mse_loss_layer_026": 0.102051, "key_mse_loss_layer_027": 0.110352, "key_mse_loss_layer_028": 0.111816, "key_mse_loss_layer_029": 0.106934, "key_mse_loss_layer_030": 0.117188, "key_mse_loss_layer_031": 0.094238, "kv_mse_loss": 0.049988, "kv_vq_loss": 0.000401, "learning_rate": 0.001, "loss": 0.050397, "step": 19870, "value_mse_loss_layer_000": 0.000328, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.00412, "value_mse_loss_layer_003": 0.007996, "value_mse_loss_layer_004": 0.007507, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.007812, "value_mse_loss_layer_007": 0.008179, "value_mse_loss_layer_008": 0.010376, "value_mse_loss_layer_009": 0.013062, "value_mse_loss_layer_010": 0.010681, "value_mse_loss_layer_011": 0.011353, "value_mse_loss_layer_012": 0.012024, "value_mse_loss_layer_013": 0.013123, "value_mse_loss_layer_014": 0.014771, "value_mse_loss_layer_015": 0.014771, "value_mse_loss_layer_016": 0.013367, "value_mse_loss_layer_017": 0.015625, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.024536, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.033691, "value_mse_loss_layer_025": 0.03833, "value_mse_loss_layer_026": 0.0354, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.050781, "value_mse_loss_layer_029": 0.060791, "value_mse_loss_layer_030": 0.070312, "value_mse_loss_layer_031": 0.055176, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.3e-05, "vq_loss_layer_004": 5.5e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 8.2e-05, "vq_loss_layer_007": 0.000113, "vq_loss_layer_008": 0.000177, "vq_loss_layer_009": 0.000157, "vq_loss_layer_010": 0.00016, "vq_loss_layer_011": 0.000158, "vq_loss_layer_012": 0.000242, "vq_loss_layer_013": 0.000194, "vq_loss_layer_014": 0.000301, "vq_loss_layer_015": 0.000317, "vq_loss_layer_016": 0.000273, "vq_loss_layer_017": 0.000221, "vq_loss_layer_018": 0.000234, "vq_loss_layer_019": 0.000155, "vq_loss_layer_020": 0.000123, "vq_loss_layer_021": 0.000265, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000197, "vq_loss_layer_024": 0.000224, "vq_loss_layer_025": 0.000343, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000595, "vq_loss_layer_028": 0.00119, "vq_loss_layer_029": 0.001541, "vq_loss_layer_030": 0.003296, "vq_loss_layer_031": 0.005066 }, { "ce_loss": 2.282414, "epoch": 0.01988, "grad_norm": 0.0010441142367199063, "key_mse_loss_layer_000": 0.002838, "key_mse_loss_layer_001": 0.009827, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.049072, "key_mse_loss_layer_004": 0.054932, "key_mse_loss_layer_005": 0.0625, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.114746, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.094727, "key_mse_loss_layer_021": 0.089844, "key_mse_loss_layer_022": 0.089355, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.068359, "key_mse_loss_layer_025": 0.067383, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.08252, "key_mse_loss_layer_029": 0.07959, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.071777, "kv_mse_loss": 0.050058, "kv_vq_loss": 0.000393, "learning_rate": 0.001, "loss": 0.050446, "step": 19880, "value_mse_loss_layer_000": 0.00034, "value_mse_loss_layer_001": 0.000946, "value_mse_loss_layer_002": 0.003738, "value_mse_loss_layer_003": 0.007019, "value_mse_loss_layer_004": 0.0065, "value_mse_loss_layer_005": 0.006134, "value_mse_loss_layer_006": 0.00824, "value_mse_loss_layer_007": 0.00885, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012573, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.014038, "value_mse_loss_layer_013": 0.015503, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014465, "value_mse_loss_layer_017": 0.018311, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.020142, "value_mse_loss_layer_021": 0.022461, "value_mse_loss_layer_022": 0.022583, "value_mse_loss_layer_023": 0.025635, "value_mse_loss_layer_024": 0.027588, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.029663, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.04248, "value_mse_loss_layer_029": 0.049072, "value_mse_loss_layer_030": 0.053711, "value_mse_loss_layer_031": 0.044922, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.9e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000162, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000178, "vq_loss_layer_012": 0.000326, "vq_loss_layer_013": 0.00032, "vq_loss_layer_014": 0.000324, "vq_loss_layer_015": 0.000351, "vq_loss_layer_016": 0.000286, "vq_loss_layer_017": 0.000294, "vq_loss_layer_018": 0.000173, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.00019, "vq_loss_layer_021": 0.000288, "vq_loss_layer_022": 0.000202, "vq_loss_layer_023": 0.00022, "vq_loss_layer_024": 0.000176, "vq_loss_layer_025": 0.000238, "vq_loss_layer_026": 0.00034, "vq_loss_layer_027": 0.000389, "vq_loss_layer_028": 0.000549, "vq_loss_layer_029": 0.00074, "vq_loss_layer_030": 0.001495, "vq_loss_layer_031": 0.002487 }, { "ce_loss": 2.250264, "epoch": 0.01989, "grad_norm": 0.0011823013192042708, "key_mse_loss_layer_000": 0.003235, "key_mse_loss_layer_001": 0.012085, "key_mse_loss_layer_002": 0.069824, "key_mse_loss_layer_003": 0.053955, "key_mse_loss_layer_004": 0.051025, "key_mse_loss_layer_005": 0.06543, "key_mse_loss_layer_006": 0.071777, "key_mse_loss_layer_007": 0.074219, "key_mse_loss_layer_008": 0.088867, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116211, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.106934, "key_mse_loss_layer_016": 0.102051, "key_mse_loss_layer_017": 0.099609, "key_mse_loss_layer_018": 0.114258, "key_mse_loss_layer_019": 0.099121, "key_mse_loss_layer_020": 0.105957, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.110352, "key_mse_loss_layer_023": 0.11084, "key_mse_loss_layer_024": 0.092773, "key_mse_loss_layer_025": 0.087402, "key_mse_loss_layer_026": 0.103027, "key_mse_loss_layer_027": 0.11084, "key_mse_loss_layer_028": 0.111816, "key_mse_loss_layer_029": 0.10791, "key_mse_loss_layer_030": 0.118164, "key_mse_loss_layer_031": 0.095703, "kv_mse_loss": 0.049814, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050204, "step": 19890, "value_mse_loss_layer_000": 0.000324, "value_mse_loss_layer_001": 0.000999, "value_mse_loss_layer_002": 0.004089, "value_mse_loss_layer_003": 0.008118, "value_mse_loss_layer_004": 0.007263, "value_mse_loss_layer_005": 0.006439, "value_mse_loss_layer_006": 0.007812, "value_mse_loss_layer_007": 0.008179, "value_mse_loss_layer_008": 0.010315, "value_mse_loss_layer_009": 0.012817, "value_mse_loss_layer_010": 0.010498, "value_mse_loss_layer_011": 0.011658, "value_mse_loss_layer_012": 0.011902, "value_mse_loss_layer_013": 0.012878, "value_mse_loss_layer_014": 0.014893, "value_mse_loss_layer_015": 0.014465, "value_mse_loss_layer_016": 0.013306, "value_mse_loss_layer_017": 0.015625, "value_mse_loss_layer_018": 0.01709, "value_mse_loss_layer_019": 0.019165, "value_mse_loss_layer_020": 0.020874, "value_mse_loss_layer_021": 0.022339, "value_mse_loss_layer_022": 0.02417, "value_mse_loss_layer_023": 0.02832, "value_mse_loss_layer_024": 0.033936, "value_mse_loss_layer_025": 0.037598, "value_mse_loss_layer_026": 0.035889, "value_mse_loss_layer_027": 0.047119, "value_mse_loss_layer_028": 0.051025, "value_mse_loss_layer_029": 0.060791, "value_mse_loss_layer_030": 0.069824, "value_mse_loss_layer_031": 0.056152, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 1.1e-05, "vq_loss_layer_002": 1.7e-05, "vq_loss_layer_003": 3.8e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 4.8e-05, "vq_loss_layer_006": 8.5e-05, "vq_loss_layer_007": 0.000111, "vq_loss_layer_008": 0.000174, "vq_loss_layer_009": 0.00015, "vq_loss_layer_010": 0.000145, "vq_loss_layer_011": 0.000164, "vq_loss_layer_012": 0.000237, "vq_loss_layer_013": 0.000186, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000265, "vq_loss_layer_016": 0.000273, "vq_loss_layer_017": 0.000221, "vq_loss_layer_018": 0.000233, "vq_loss_layer_019": 0.000154, "vq_loss_layer_020": 0.000123, "vq_loss_layer_021": 0.000256, "vq_loss_layer_022": 0.000198, "vq_loss_layer_023": 0.000179, "vq_loss_layer_024": 0.000215, "vq_loss_layer_025": 0.000317, "vq_loss_layer_026": 0.000481, "vq_loss_layer_027": 0.00058, "vq_loss_layer_028": 0.001183, "vq_loss_layer_029": 0.001526, "vq_loss_layer_030": 0.002945, "vq_loss_layer_031": 0.00531 }, { "ce_loss": 2.349211, "epoch": 0.0199, "grad_norm": 0.0010870086262002587, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.052979, "key_mse_loss_layer_003": 0.044922, "key_mse_loss_layer_004": 0.043213, "key_mse_loss_layer_005": 0.056641, "key_mse_loss_layer_006": 0.063965, "key_mse_loss_layer_007": 0.072754, "key_mse_loss_layer_008": 0.087891, "key_mse_loss_layer_009": 0.092285, "key_mse_loss_layer_010": 0.105469, "key_mse_loss_layer_011": 0.100586, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.126953, "key_mse_loss_layer_014": 0.124512, "key_mse_loss_layer_015": 0.112305, "key_mse_loss_layer_016": 0.10791, "key_mse_loss_layer_017": 0.106445, "key_mse_loss_layer_018": 0.112305, "key_mse_loss_layer_019": 0.092773, "key_mse_loss_layer_020": 0.105469, "key_mse_loss_layer_021": 0.099121, "key_mse_loss_layer_022": 0.104492, "key_mse_loss_layer_023": 0.102051, "key_mse_loss_layer_024": 0.081543, "key_mse_loss_layer_025": 0.077637, "key_mse_loss_layer_026": 0.09082, "key_mse_loss_layer_027": 0.09082, "key_mse_loss_layer_028": 0.097168, "key_mse_loss_layer_029": 0.088867, "key_mse_loss_layer_030": 0.097168, "key_mse_loss_layer_031": 0.068848, "kv_mse_loss": 0.049667, "kv_vq_loss": 0.000381, "learning_rate": 0.001, "loss": 0.050052, "step": 19900, "value_mse_loss_layer_000": 0.000357, "value_mse_loss_layer_001": 0.000965, "value_mse_loss_layer_002": 0.003845, "value_mse_loss_layer_003": 0.007385, "value_mse_loss_layer_004": 0.007172, "value_mse_loss_layer_005": 0.006409, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010376, "value_mse_loss_layer_009": 0.014404, "value_mse_loss_layer_010": 0.011658, "value_mse_loss_layer_011": 0.011841, "value_mse_loss_layer_012": 0.012756, "value_mse_loss_layer_013": 0.013794, "value_mse_loss_layer_014": 0.014282, "value_mse_loss_layer_015": 0.015137, "value_mse_loss_layer_016": 0.012512, "value_mse_loss_layer_017": 0.016113, "value_mse_loss_layer_018": 0.014587, "value_mse_loss_layer_019": 0.016846, "value_mse_loss_layer_020": 0.018433, "value_mse_loss_layer_021": 0.020508, "value_mse_loss_layer_022": 0.020752, "value_mse_loss_layer_023": 0.023926, "value_mse_loss_layer_024": 0.026733, "value_mse_loss_layer_025": 0.031128, "value_mse_loss_layer_026": 0.027466, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.040527, "value_mse_loss_layer_029": 0.048096, "value_mse_loss_layer_030": 0.053955, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 7e-06, "vq_loss_layer_002": 7e-06, "vq_loss_layer_003": 2.3e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.3e-05, "vq_loss_layer_007": 0.000143, "vq_loss_layer_008": 0.000162, "vq_loss_layer_009": 0.000214, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000176, "vq_loss_layer_012": 0.000305, "vq_loss_layer_013": 0.000213, "vq_loss_layer_014": 0.000319, "vq_loss_layer_015": 0.000271, "vq_loss_layer_016": 0.000277, "vq_loss_layer_017": 0.000244, "vq_loss_layer_018": 0.000149, "vq_loss_layer_019": 0.000146, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000207, "vq_loss_layer_023": 0.000265, "vq_loss_layer_024": 0.000233, "vq_loss_layer_025": 0.000349, "vq_loss_layer_026": 0.00045, "vq_loss_layer_027": 0.000591, "vq_loss_layer_028": 0.000843, "vq_loss_layer_029": 0.000881, "vq_loss_layer_030": 0.00206, "vq_loss_layer_031": 0.003052 }, { "ce_loss": 2.340518, "epoch": 0.01991, "grad_norm": 0.001068222802132368, "key_mse_loss_layer_000": 0.003555, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.046631, "key_mse_loss_layer_004": 0.044189, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.066895, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.089355, "key_mse_loss_layer_009": 0.094238, "key_mse_loss_layer_010": 0.105957, "key_mse_loss_layer_011": 0.103516, "key_mse_loss_layer_012": 0.077637, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.108398, "key_mse_loss_layer_016": 0.098633, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104492, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.097168, "key_mse_loss_layer_023": 0.091797, "key_mse_loss_layer_024": 0.073242, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.082031, "key_mse_loss_layer_027": 0.081543, "key_mse_loss_layer_028": 0.088379, "key_mse_loss_layer_029": 0.081055, "key_mse_loss_layer_030": 0.083984, "key_mse_loss_layer_031": 0.063477, "kv_mse_loss": 0.049448, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.049832, "step": 19910, "value_mse_loss_layer_000": 0.00036, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007629, "value_mse_loss_layer_004": 0.007141, "value_mse_loss_layer_005": 0.006683, "value_mse_loss_layer_006": 0.008301, "value_mse_loss_layer_007": 0.008911, "value_mse_loss_layer_008": 0.010864, "value_mse_loss_layer_009": 0.014893, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.013123, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.016357, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014526, "value_mse_loss_layer_017": 0.016479, "value_mse_loss_layer_018": 0.014648, "value_mse_loss_layer_019": 0.01709, "value_mse_loss_layer_020": 0.018433, "value_mse_loss_layer_021": 0.021851, "value_mse_loss_layer_022": 0.022095, "value_mse_loss_layer_023": 0.023926, "value_mse_loss_layer_024": 0.026855, "value_mse_loss_layer_025": 0.030762, "value_mse_loss_layer_026": 0.028687, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.04126, "value_mse_loss_layer_029": 0.047363, "value_mse_loss_layer_030": 0.052734, "value_mse_loss_layer_031": 0.045654, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 9e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 6.8e-05, "vq_loss_layer_006": 0.0001, "vq_loss_layer_007": 0.000149, "vq_loss_layer_008": 0.000175, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000211, "vq_loss_layer_011": 0.000212, "vq_loss_layer_012": 0.000322, "vq_loss_layer_013": 0.00028, "vq_loss_layer_014": 0.00041, "vq_loss_layer_015": 0.000443, "vq_loss_layer_016": 0.000433, "vq_loss_layer_017": 0.000271, "vq_loss_layer_018": 0.000176, "vq_loss_layer_019": 0.000182, "vq_loss_layer_020": 0.00024, "vq_loss_layer_021": 0.000378, "vq_loss_layer_022": 0.000328, "vq_loss_layer_023": 0.000317, "vq_loss_layer_024": 0.000381, "vq_loss_layer_025": 0.000443, "vq_loss_layer_026": 0.000629, "vq_loss_layer_027": 0.000652, "vq_loss_layer_028": 0.000893, "vq_loss_layer_029": 0.000854, "vq_loss_layer_030": 0.002151, "vq_loss_layer_031": 0.003403 }, { "ce_loss": 2.250889, "epoch": 0.01992, "grad_norm": 0.001110012293793261, "key_mse_loss_layer_000": 0.003204, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.055176, "key_mse_loss_layer_003": 0.047363, "key_mse_loss_layer_004": 0.045166, "key_mse_loss_layer_005": 0.058105, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.085449, "key_mse_loss_layer_009": 0.088379, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.074707, "key_mse_loss_layer_013": 0.12207, "key_mse_loss_layer_014": 0.117676, "key_mse_loss_layer_015": 0.107422, "key_mse_loss_layer_016": 0.099609, "key_mse_loss_layer_017": 0.101562, "key_mse_loss_layer_018": 0.105957, "key_mse_loss_layer_019": 0.090332, "key_mse_loss_layer_020": 0.099609, "key_mse_loss_layer_021": 0.094238, "key_mse_loss_layer_022": 0.097656, "key_mse_loss_layer_023": 0.097168, "key_mse_loss_layer_024": 0.078613, "key_mse_loss_layer_025": 0.07373, "key_mse_loss_layer_026": 0.084961, "key_mse_loss_layer_027": 0.083008, "key_mse_loss_layer_028": 0.089355, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.064941, "kv_mse_loss": 0.049469, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.049866, "step": 19920, "value_mse_loss_layer_000": 0.000347, "value_mse_loss_layer_001": 0.000961, "value_mse_loss_layer_002": 0.003845, "value_mse_loss_layer_003": 0.007477, "value_mse_loss_layer_004": 0.007111, "value_mse_loss_layer_005": 0.00647, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010498, "value_mse_loss_layer_009": 0.014526, "value_mse_loss_layer_010": 0.011658, "value_mse_loss_layer_011": 0.012268, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.014648, "value_mse_loss_layer_014": 0.015259, "value_mse_loss_layer_015": 0.016602, "value_mse_loss_layer_016": 0.013733, "value_mse_loss_layer_017": 0.016724, "value_mse_loss_layer_018": 0.015076, "value_mse_loss_layer_019": 0.017578, "value_mse_loss_layer_020": 0.019043, "value_mse_loss_layer_021": 0.02124, "value_mse_loss_layer_022": 0.020996, "value_mse_loss_layer_023": 0.024536, "value_mse_loss_layer_024": 0.029175, "value_mse_loss_layer_025": 0.031982, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.037354, "value_mse_loss_layer_028": 0.041748, "value_mse_loss_layer_029": 0.048584, "value_mse_loss_layer_030": 0.053467, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 8e-06, "vq_loss_layer_002": 8e-06, "vq_loss_layer_003": 2.5e-05, "vq_loss_layer_004": 5.1e-05, "vq_loss_layer_005": 5.7e-05, "vq_loss_layer_006": 9.9e-05, "vq_loss_layer_007": 0.000154, "vq_loss_layer_008": 0.000161, "vq_loss_layer_009": 0.00021, "vq_loss_layer_010": 0.000175, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.000338, "vq_loss_layer_013": 0.000271, "vq_loss_layer_014": 0.000347, "vq_loss_layer_015": 0.000328, "vq_loss_layer_016": 0.000336, "vq_loss_layer_017": 0.000263, "vq_loss_layer_018": 0.000172, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000172, "vq_loss_layer_021": 0.000317, "vq_loss_layer_022": 0.000197, "vq_loss_layer_023": 0.000224, "vq_loss_layer_024": 0.000278, "vq_loss_layer_025": 0.00033, "vq_loss_layer_026": 0.000465, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000877, "vq_loss_layer_029": 0.000954, "vq_loss_layer_030": 0.002106, "vq_loss_layer_031": 0.003174 }, { "ce_loss": 2.333245, "epoch": 0.01993, "grad_norm": 0.0011284409556537867, "key_mse_loss_layer_000": 0.00322, "key_mse_loss_layer_001": 0.010071, "key_mse_loss_layer_002": 0.055908, "key_mse_loss_layer_003": 0.050781, "key_mse_loss_layer_004": 0.057373, "key_mse_loss_layer_005": 0.060303, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.07666, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098145, "key_mse_loss_layer_011": 0.09668, "key_mse_loss_layer_012": 0.070312, "key_mse_loss_layer_013": 0.110352, "key_mse_loss_layer_014": 0.107422, "key_mse_loss_layer_015": 0.09668, "key_mse_loss_layer_016": 0.087891, "key_mse_loss_layer_017": 0.092285, "key_mse_loss_layer_018": 0.09668, "key_mse_loss_layer_019": 0.084473, "key_mse_loss_layer_020": 0.091797, "key_mse_loss_layer_021": 0.088379, "key_mse_loss_layer_022": 0.088867, "key_mse_loss_layer_023": 0.086914, "key_mse_loss_layer_024": 0.069336, "key_mse_loss_layer_025": 0.067871, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.085938, "key_mse_loss_layer_031": 0.07373, "kv_mse_loss": 0.049484, "kv_vq_loss": 0.000383, "learning_rate": 0.001, "loss": 0.04986, "step": 19930, "value_mse_loss_layer_000": 0.000357, "value_mse_loss_layer_001": 0.000969, "value_mse_loss_layer_002": 0.003769, "value_mse_loss_layer_003": 0.006989, "value_mse_loss_layer_004": 0.006409, "value_mse_loss_layer_005": 0.005981, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008362, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.014587, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.014893, "value_mse_loss_layer_014": 0.015869, "value_mse_loss_layer_015": 0.017944, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.015137, "value_mse_loss_layer_019": 0.018311, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022705, "value_mse_loss_layer_022": 0.023193, "value_mse_loss_layer_023": 0.026245, "value_mse_loss_layer_024": 0.028198, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.030396, "value_mse_loss_layer_027": 0.038574, "value_mse_loss_layer_028": 0.044189, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.056152, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 4e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 0.000106, "vq_loss_layer_007": 0.000155, "vq_loss_layer_008": 0.000135, "vq_loss_layer_009": 0.000185, "vq_loss_layer_010": 0.000156, "vq_loss_layer_011": 0.00019, "vq_loss_layer_012": 0.000317, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000332, "vq_loss_layer_015": 0.000345, "vq_loss_layer_016": 0.000298, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000151, "vq_loss_layer_019": 0.000153, "vq_loss_layer_020": 0.000171, "vq_loss_layer_021": 0.000269, "vq_loss_layer_022": 0.000192, "vq_loss_layer_023": 0.000212, "vq_loss_layer_024": 0.000199, "vq_loss_layer_025": 0.000259, "vq_loss_layer_026": 0.000423, "vq_loss_layer_027": 0.000496, "vq_loss_layer_028": 0.000587, "vq_loss_layer_029": 0.000843, "vq_loss_layer_030": 0.001884, "vq_loss_layer_031": 0.002518 }, { "ce_loss": 2.357806, "epoch": 0.01994, "grad_norm": 0.00110428046900779, "key_mse_loss_layer_000": 0.00325, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.049805, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.059814, "key_mse_loss_layer_006": 0.066406, "key_mse_loss_layer_007": 0.075684, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086426, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.11377, "key_mse_loss_layer_014": 0.111328, "key_mse_loss_layer_015": 0.099121, "key_mse_loss_layer_016": 0.09082, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.090332, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.088867, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049731, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.050107, "step": 19940, "value_mse_loss_layer_000": 0.000353, "value_mse_loss_layer_001": 0.000965, "value_mse_loss_layer_002": 0.003815, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006897, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012512, "value_mse_loss_layer_012": 0.013672, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018555, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.023071, "value_mse_loss_layer_022": 0.023682, "value_mse_loss_layer_023": 0.026489, "value_mse_loss_layer_024": 0.030273, "value_mse_loss_layer_025": 0.034668, "value_mse_loss_layer_026": 0.031494, "value_mse_loss_layer_027": 0.040527, "value_mse_loss_layer_028": 0.04541, "value_mse_loss_layer_029": 0.05249, "value_mse_loss_layer_030": 0.057861, "value_mse_loss_layer_031": 0.046631, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 5e-06, "vq_loss_layer_003": 2e-05, "vq_loss_layer_004": 5.2e-05, "vq_loss_layer_005": 5.1e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000145, "vq_loss_layer_008": 0.000146, "vq_loss_layer_009": 0.000221, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000184, "vq_loss_layer_012": 0.000309, "vq_loss_layer_013": 0.000275, "vq_loss_layer_014": 0.000338, "vq_loss_layer_015": 0.000341, "vq_loss_layer_016": 0.000313, "vq_loss_layer_017": 0.000273, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000157, "vq_loss_layer_020": 0.00017, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000216, "vq_loss_layer_023": 0.000216, "vq_loss_layer_024": 0.000254, "vq_loss_layer_025": 0.000273, "vq_loss_layer_026": 0.000418, "vq_loss_layer_027": 0.000515, "vq_loss_layer_028": 0.000683, "vq_loss_layer_029": 0.000828, "vq_loss_layer_030": 0.001907, "vq_loss_layer_031": 0.002579 }, { "ce_loss": 2.357396, "epoch": 0.01995, "grad_norm": 0.0010310912039130926, "key_mse_loss_layer_000": 0.003342, "key_mse_loss_layer_001": 0.010315, "key_mse_loss_layer_002": 0.054932, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.051514, "key_mse_loss_layer_005": 0.060791, "key_mse_loss_layer_006": 0.067383, "key_mse_loss_layer_007": 0.076172, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.099609, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.109863, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.088867, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.099121, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.09375, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.087891, "key_mse_loss_layer_024": 0.069824, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.077148, "key_mse_loss_layer_027": 0.077637, "key_mse_loss_layer_028": 0.083984, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.08252, "key_mse_loss_layer_031": 0.070801, "kv_mse_loss": 0.049664, "kv_vq_loss": 0.000371, "learning_rate": 0.001, "loss": 0.050015, "step": 19950, "value_mse_loss_layer_000": 0.000362, "value_mse_loss_layer_001": 0.000973, "value_mse_loss_layer_002": 0.003815, "value_mse_loss_layer_003": 0.007019, "value_mse_loss_layer_004": 0.006561, "value_mse_loss_layer_005": 0.006287, "value_mse_loss_layer_006": 0.008057, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.011047, "value_mse_loss_layer_009": 0.015076, "value_mse_loss_layer_010": 0.012207, "value_mse_loss_layer_011": 0.013, "value_mse_loss_layer_012": 0.014221, "value_mse_loss_layer_013": 0.015869, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018677, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018433, "value_mse_loss_layer_018": 0.015747, "value_mse_loss_layer_019": 0.018799, "value_mse_loss_layer_020": 0.020386, "value_mse_loss_layer_021": 0.022827, "value_mse_loss_layer_022": 0.02356, "value_mse_loss_layer_023": 0.026123, "value_mse_loss_layer_024": 0.028809, "value_mse_loss_layer_025": 0.033447, "value_mse_loss_layer_026": 0.03064, "value_mse_loss_layer_027": 0.039795, "value_mse_loss_layer_028": 0.043945, "value_mse_loss_layer_029": 0.05127, "value_mse_loss_layer_030": 0.054443, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.2e-05, "vq_loss_layer_004": 4.2e-05, "vq_loss_layer_005": 6e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000156, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000192, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000195, "vq_loss_layer_012": 0.00033, "vq_loss_layer_013": 0.000332, "vq_loss_layer_014": 0.000328, "vq_loss_layer_015": 0.000397, "vq_loss_layer_016": 0.000311, "vq_loss_layer_017": 0.000299, "vq_loss_layer_018": 0.000194, "vq_loss_layer_019": 0.000179, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000298, "vq_loss_layer_022": 0.000244, "vq_loss_layer_023": 0.000256, "vq_loss_layer_024": 0.00025, "vq_loss_layer_025": 0.000301, "vq_loss_layer_026": 0.000444, "vq_loss_layer_027": 0.000507, "vq_loss_layer_028": 0.000828, "vq_loss_layer_029": 0.000992, "vq_loss_layer_030": 0.001656, "vq_loss_layer_031": 0.002701 }, { "ce_loss": 2.311495, "epoch": 0.01996, "grad_norm": 0.0011397405760362744, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.010132, "key_mse_loss_layer_002": 0.05542, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.052002, "key_mse_loss_layer_005": 0.062012, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077637, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.091309, "key_mse_loss_layer_010": 0.102051, "key_mse_loss_layer_011": 0.101074, "key_mse_loss_layer_012": 0.074219, "key_mse_loss_layer_013": 0.118164, "key_mse_loss_layer_014": 0.113281, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.088379, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.093262, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.085938, "key_mse_loss_layer_029": 0.081543, "key_mse_loss_layer_030": 0.084473, "key_mse_loss_layer_031": 0.068359, "kv_mse_loss": 0.049426, "kv_vq_loss": 0.000388, "learning_rate": 0.001, "loss": 0.04982, "step": 19960, "value_mse_loss_layer_000": 0.000359, "value_mse_loss_layer_001": 0.000965, "value_mse_loss_layer_002": 0.003876, "value_mse_loss_layer_003": 0.007111, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.006317, "value_mse_loss_layer_006": 0.008606, "value_mse_loss_layer_007": 0.008789, "value_mse_loss_layer_008": 0.010803, "value_mse_loss_layer_009": 0.015381, "value_mse_loss_layer_010": 0.012329, "value_mse_loss_layer_011": 0.012878, "value_mse_loss_layer_012": 0.014282, "value_mse_loss_layer_013": 0.015442, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.018188, "value_mse_loss_layer_016": 0.014648, "value_mse_loss_layer_017": 0.017822, "value_mse_loss_layer_018": 0.015381, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019531, "value_mse_loss_layer_021": 0.022095, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.025269, "value_mse_loss_layer_024": 0.027466, "value_mse_loss_layer_025": 0.032959, "value_mse_loss_layer_026": 0.029541, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.042236, "value_mse_loss_layer_029": 0.048828, "value_mse_loss_layer_030": 0.053711, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 1.6e-05, "vq_loss_layer_004": 5.8e-05, "vq_loss_layer_005": 6.2e-05, "vq_loss_layer_006": 0.000126, "vq_loss_layer_007": 0.000158, "vq_loss_layer_008": 0.00016, "vq_loss_layer_009": 0.000233, "vq_loss_layer_010": 0.000183, "vq_loss_layer_011": 0.0002, "vq_loss_layer_012": 0.000334, "vq_loss_layer_013": 0.000292, "vq_loss_layer_014": 0.000357, "vq_loss_layer_015": 0.000395, "vq_loss_layer_016": 0.000334, "vq_loss_layer_017": 0.000298, "vq_loss_layer_018": 0.00018, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000294, "vq_loss_layer_022": 0.000252, "vq_loss_layer_023": 0.000238, "vq_loss_layer_024": 0.000252, "vq_loss_layer_025": 0.000319, "vq_loss_layer_026": 0.000422, "vq_loss_layer_027": 0.000439, "vq_loss_layer_028": 0.000698, "vq_loss_layer_029": 0.000751, "vq_loss_layer_030": 0.001892, "vq_loss_layer_031": 0.002701 }, { "ce_loss": 2.33199, "epoch": 0.01997, "grad_norm": 0.0010477195028215647, "key_mse_loss_layer_000": 0.002792, "key_mse_loss_layer_001": 0.009888, "key_mse_loss_layer_002": 0.053467, "key_mse_loss_layer_003": 0.047852, "key_mse_loss_layer_004": 0.048584, "key_mse_loss_layer_005": 0.059082, "key_mse_loss_layer_006": 0.064941, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084961, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.100586, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.117676, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.102539, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098145, "key_mse_loss_layer_018": 0.103516, "key_mse_loss_layer_019": 0.087402, "key_mse_loss_layer_020": 0.097168, "key_mse_loss_layer_021": 0.091797, "key_mse_loss_layer_022": 0.093262, "key_mse_loss_layer_023": 0.09082, "key_mse_loss_layer_024": 0.071777, "key_mse_loss_layer_025": 0.069336, "key_mse_loss_layer_026": 0.080078, "key_mse_loss_layer_027": 0.077148, "key_mse_loss_layer_028": 0.085449, "key_mse_loss_layer_029": 0.080078, "key_mse_loss_layer_030": 0.083008, "key_mse_loss_layer_031": 0.065918, "kv_mse_loss": 0.049255, "kv_vq_loss": 0.000379, "learning_rate": 0.001, "loss": 0.049637, "step": 19970, "value_mse_loss_layer_000": 0.000341, "value_mse_loss_layer_001": 0.000946, "value_mse_loss_layer_002": 0.003769, "value_mse_loss_layer_003": 0.007324, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006348, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008484, "value_mse_loss_layer_008": 0.01062, "value_mse_loss_layer_009": 0.014526, "value_mse_loss_layer_010": 0.01178, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.01355, "value_mse_loss_layer_013": 0.015076, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017456, "value_mse_loss_layer_016": 0.014038, "value_mse_loss_layer_017": 0.0177, "value_mse_loss_layer_018": 0.014954, "value_mse_loss_layer_019": 0.017822, "value_mse_loss_layer_020": 0.019409, "value_mse_loss_layer_021": 0.021729, "value_mse_loss_layer_022": 0.021851, "value_mse_loss_layer_023": 0.024414, "value_mse_loss_layer_024": 0.026978, "value_mse_loss_layer_025": 0.031738, "value_mse_loss_layer_026": 0.028442, "value_mse_loss_layer_027": 0.036621, "value_mse_loss_layer_028": 0.041748, "value_mse_loss_layer_029": 0.046875, "value_mse_loss_layer_030": 0.051758, "value_mse_loss_layer_031": 0.04541, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 5e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.2e-05, "vq_loss_layer_004": 4.7e-05, "vq_loss_layer_005": 5.8e-05, "vq_loss_layer_006": 0.000102, "vq_loss_layer_007": 0.000147, "vq_loss_layer_008": 0.000154, "vq_loss_layer_009": 0.000181, "vq_loss_layer_010": 0.000157, "vq_loss_layer_011": 0.000185, "vq_loss_layer_012": 0.000303, "vq_loss_layer_013": 0.000259, "vq_loss_layer_014": 0.00036, "vq_loss_layer_015": 0.00033, "vq_loss_layer_016": 0.000299, "vq_loss_layer_017": 0.000324, "vq_loss_layer_018": 0.000165, "vq_loss_layer_019": 0.00015, "vq_loss_layer_020": 0.000179, "vq_loss_layer_021": 0.000307, "vq_loss_layer_022": 0.000221, "vq_loss_layer_023": 0.000233, "vq_loss_layer_024": 0.000222, "vq_loss_layer_025": 0.000271, "vq_loss_layer_026": 0.000408, "vq_loss_layer_027": 0.000433, "vq_loss_layer_028": 0.000702, "vq_loss_layer_029": 0.00069, "vq_loss_layer_030": 0.001503, "vq_loss_layer_031": 0.002548 }, { "ce_loss": 2.358292, "epoch": 0.01998, "grad_norm": 0.0011689856182783842, "key_mse_loss_layer_000": 0.003098, "key_mse_loss_layer_001": 0.009949, "key_mse_loss_layer_002": 0.05249, "key_mse_loss_layer_003": 0.045898, "key_mse_loss_layer_004": 0.047363, "key_mse_loss_layer_005": 0.058594, "key_mse_loss_layer_006": 0.064453, "key_mse_loss_layer_007": 0.074707, "key_mse_loss_layer_008": 0.083496, "key_mse_loss_layer_009": 0.086914, "key_mse_loss_layer_010": 0.098633, "key_mse_loss_layer_011": 0.098145, "key_mse_loss_layer_012": 0.072266, "key_mse_loss_layer_013": 0.116699, "key_mse_loss_layer_014": 0.112305, "key_mse_loss_layer_015": 0.098633, "key_mse_loss_layer_016": 0.092285, "key_mse_loss_layer_017": 0.095703, "key_mse_loss_layer_018": 0.100586, "key_mse_loss_layer_019": 0.083984, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.089844, "key_mse_loss_layer_023": 0.086426, "key_mse_loss_layer_024": 0.068848, "key_mse_loss_layer_025": 0.065918, "key_mse_loss_layer_026": 0.07666, "key_mse_loss_layer_027": 0.075195, "key_mse_loss_layer_028": 0.082031, "key_mse_loss_layer_029": 0.078125, "key_mse_loss_layer_030": 0.080566, "key_mse_loss_layer_031": 0.064453, "kv_mse_loss": 0.049301, "kv_vq_loss": 0.000374, "learning_rate": 0.001, "loss": 0.04967, "step": 19980, "value_mse_loss_layer_000": 0.000349, "value_mse_loss_layer_001": 0.000954, "value_mse_loss_layer_002": 0.003754, "value_mse_loss_layer_003": 0.007294, "value_mse_loss_layer_004": 0.006714, "value_mse_loss_layer_005": 0.006287, "value_mse_loss_layer_006": 0.008118, "value_mse_loss_layer_007": 0.008667, "value_mse_loss_layer_008": 0.010559, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.011902, "value_mse_loss_layer_011": 0.012573, "value_mse_loss_layer_012": 0.013611, "value_mse_loss_layer_013": 0.015198, "value_mse_loss_layer_014": 0.015747, "value_mse_loss_layer_015": 0.017822, "value_mse_loss_layer_016": 0.014282, "value_mse_loss_layer_017": 0.017578, "value_mse_loss_layer_018": 0.014954, "value_mse_loss_layer_019": 0.0177, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.021973, "value_mse_loss_layer_022": 0.021729, "value_mse_loss_layer_023": 0.024658, "value_mse_loss_layer_024": 0.027588, "value_mse_loss_layer_025": 0.032471, "value_mse_loss_layer_026": 0.029419, "value_mse_loss_layer_027": 0.037109, "value_mse_loss_layer_028": 0.041504, "value_mse_loss_layer_029": 0.049316, "value_mse_loss_layer_030": 0.054199, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 4e-06, "vq_loss_layer_003": 2.1e-05, "vq_loss_layer_004": 4.6e-05, "vq_loss_layer_005": 5.2e-05, "vq_loss_layer_006": 9.8e-05, "vq_loss_layer_007": 0.000151, "vq_loss_layer_008": 0.00015, "vq_loss_layer_009": 0.000197, "vq_loss_layer_010": 0.00017, "vq_loss_layer_011": 0.000183, "vq_loss_layer_012": 0.000299, "vq_loss_layer_013": 0.000252, "vq_loss_layer_014": 0.00032, "vq_loss_layer_015": 0.000355, "vq_loss_layer_016": 0.000292, "vq_loss_layer_017": 0.000311, "vq_loss_layer_018": 0.000163, "vq_loss_layer_019": 0.000134, "vq_loss_layer_020": 0.000175, "vq_loss_layer_021": 0.000309, "vq_loss_layer_022": 0.000208, "vq_loss_layer_023": 0.000241, "vq_loss_layer_024": 0.000263, "vq_loss_layer_025": 0.000322, "vq_loss_layer_026": 0.000444, "vq_loss_layer_027": 0.000463, "vq_loss_layer_028": 0.00071, "vq_loss_layer_029": 0.000919, "vq_loss_layer_030": 0.001953, "vq_loss_layer_031": 0.002914 }, { "ce_loss": 2.30657, "epoch": 0.01999, "grad_norm": 0.0012126168003305793, "key_mse_loss_layer_000": 0.003372, "key_mse_loss_layer_001": 0.010254, "key_mse_loss_layer_002": 0.054199, "key_mse_loss_layer_003": 0.048096, "key_mse_loss_layer_004": 0.046875, "key_mse_loss_layer_005": 0.05957, "key_mse_loss_layer_006": 0.065918, "key_mse_loss_layer_007": 0.075195, "key_mse_loss_layer_008": 0.084473, "key_mse_loss_layer_009": 0.087402, "key_mse_loss_layer_010": 0.100098, "key_mse_loss_layer_011": 0.099121, "key_mse_loss_layer_012": 0.072754, "key_mse_loss_layer_013": 0.113281, "key_mse_loss_layer_014": 0.110352, "key_mse_loss_layer_015": 0.098145, "key_mse_loss_layer_016": 0.090332, "key_mse_loss_layer_017": 0.094238, "key_mse_loss_layer_018": 0.101074, "key_mse_loss_layer_019": 0.085938, "key_mse_loss_layer_020": 0.092773, "key_mse_loss_layer_021": 0.089355, "key_mse_loss_layer_022": 0.091309, "key_mse_loss_layer_023": 0.089844, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.068848, "key_mse_loss_layer_026": 0.079102, "key_mse_loss_layer_027": 0.080078, "key_mse_loss_layer_028": 0.086426, "key_mse_loss_layer_029": 0.084473, "key_mse_loss_layer_030": 0.089844, "key_mse_loss_layer_031": 0.076172, "kv_mse_loss": 0.049435, "kv_vq_loss": 0.000395, "learning_rate": 0.001, "loss": 0.049841, "step": 19990, "value_mse_loss_layer_000": 0.000353, "value_mse_loss_layer_001": 0.000984, "value_mse_loss_layer_002": 0.003967, "value_mse_loss_layer_003": 0.007446, "value_mse_loss_layer_004": 0.006989, "value_mse_loss_layer_005": 0.0065, "value_mse_loss_layer_006": 0.008362, "value_mse_loss_layer_007": 0.008545, "value_mse_loss_layer_008": 0.010986, "value_mse_loss_layer_009": 0.014771, "value_mse_loss_layer_010": 0.011658, "value_mse_loss_layer_011": 0.01239, "value_mse_loss_layer_012": 0.013733, "value_mse_loss_layer_013": 0.014954, "value_mse_loss_layer_014": 0.015991, "value_mse_loss_layer_015": 0.017578, "value_mse_loss_layer_016": 0.014221, "value_mse_loss_layer_017": 0.017456, "value_mse_loss_layer_018": 0.015869, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.019653, "value_mse_loss_layer_021": 0.022217, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.029907, "value_mse_loss_layer_025": 0.033203, "value_mse_loss_layer_026": 0.031006, "value_mse_loss_layer_027": 0.039551, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.051514, "value_mse_loss_layer_030": 0.057129, "value_mse_loss_layer_031": 0.046143, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 4e-06, "vq_loss_layer_002": 6e-06, "vq_loss_layer_003": 1.5e-05, "vq_loss_layer_004": 4.3e-05, "vq_loss_layer_005": 5.5e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.000136, "vq_loss_layer_008": 0.000159, "vq_loss_layer_009": 0.000217, "vq_loss_layer_010": 0.000165, "vq_loss_layer_011": 0.000177, "vq_loss_layer_012": 0.000298, "vq_loss_layer_013": 0.000254, "vq_loss_layer_014": 0.00034, "vq_loss_layer_015": 0.00034, "vq_loss_layer_016": 0.000307, "vq_loss_layer_017": 0.00028, "vq_loss_layer_018": 0.000185, "vq_loss_layer_019": 0.000165, "vq_loss_layer_020": 0.000164, "vq_loss_layer_021": 0.000269, "vq_loss_layer_022": 0.00022, "vq_loss_layer_023": 0.000248, "vq_loss_layer_024": 0.000261, "vq_loss_layer_025": 0.000298, "vq_loss_layer_026": 0.000467, "vq_loss_layer_027": 0.000523, "vq_loss_layer_028": 0.000809, "vq_loss_layer_029": 0.001244, "vq_loss_layer_030": 0.002365, "vq_loss_layer_031": 0.00354 }, { "ce_loss": 2.318641, "epoch": 0.02, "grad_norm": 0.001002313569188118, "key_mse_loss_layer_000": 0.003113, "key_mse_loss_layer_001": 0.01001, "key_mse_loss_layer_002": 0.054688, "key_mse_loss_layer_003": 0.049561, "key_mse_loss_layer_004": 0.054688, "key_mse_loss_layer_005": 0.061523, "key_mse_loss_layer_006": 0.067871, "key_mse_loss_layer_007": 0.077148, "key_mse_loss_layer_008": 0.085938, "key_mse_loss_layer_009": 0.089355, "key_mse_loss_layer_010": 0.101562, "key_mse_loss_layer_011": 0.099609, "key_mse_loss_layer_012": 0.073242, "key_mse_loss_layer_013": 0.117188, "key_mse_loss_layer_014": 0.114258, "key_mse_loss_layer_015": 0.103027, "key_mse_loss_layer_016": 0.095215, "key_mse_loss_layer_017": 0.098633, "key_mse_loss_layer_018": 0.104004, "key_mse_loss_layer_019": 0.086914, "key_mse_loss_layer_020": 0.09668, "key_mse_loss_layer_021": 0.092773, "key_mse_loss_layer_022": 0.09375, "key_mse_loss_layer_023": 0.091309, "key_mse_loss_layer_024": 0.072754, "key_mse_loss_layer_025": 0.070312, "key_mse_loss_layer_026": 0.081055, "key_mse_loss_layer_027": 0.07959, "key_mse_loss_layer_028": 0.087402, "key_mse_loss_layer_029": 0.08252, "key_mse_loss_layer_030": 0.086426, "key_mse_loss_layer_031": 0.069336, "kv_mse_loss": 0.049316, "kv_vq_loss": 0.000384, "learning_rate": 0.001, "loss": 0.049695, "step": 20000, "value_mse_loss_layer_000": 0.000357, "value_mse_loss_layer_001": 0.000973, "value_mse_loss_layer_002": 0.003906, "value_mse_loss_layer_003": 0.007111, "value_mse_loss_layer_004": 0.006561, "value_mse_loss_layer_005": 0.006165, "value_mse_loss_layer_006": 0.008179, "value_mse_loss_layer_007": 0.008606, "value_mse_loss_layer_008": 0.010925, "value_mse_loss_layer_009": 0.014954, "value_mse_loss_layer_010": 0.012085, "value_mse_loss_layer_011": 0.012634, "value_mse_loss_layer_012": 0.013855, "value_mse_loss_layer_013": 0.015259, "value_mse_loss_layer_014": 0.016235, "value_mse_loss_layer_015": 0.018311, "value_mse_loss_layer_016": 0.014832, "value_mse_loss_layer_017": 0.018188, "value_mse_loss_layer_018": 0.015991, "value_mse_loss_layer_019": 0.018433, "value_mse_loss_layer_020": 0.02063, "value_mse_loss_layer_021": 0.022949, "value_mse_loss_layer_022": 0.023315, "value_mse_loss_layer_023": 0.026733, "value_mse_loss_layer_024": 0.028931, "value_mse_loss_layer_025": 0.033936, "value_mse_loss_layer_026": 0.03125, "value_mse_loss_layer_027": 0.039307, "value_mse_loss_layer_028": 0.044434, "value_mse_loss_layer_029": 0.050781, "value_mse_loss_layer_030": 0.055176, "value_mse_loss_layer_031": 0.045898, "vq_loss_layer_000": 4e-06, "vq_loss_layer_001": 2e-06, "vq_loss_layer_002": 3e-06, "vq_loss_layer_003": 1.4e-05, "vq_loss_layer_004": 4.5e-05, "vq_loss_layer_005": 5.3e-05, "vq_loss_layer_006": 0.000101, "vq_loss_layer_007": 0.00016, "vq_loss_layer_008": 0.000157, "vq_loss_layer_009": 0.000213, "vq_loss_layer_010": 0.000172, "vq_loss_layer_011": 0.000182, "vq_loss_layer_012": 0.000311, "vq_loss_layer_013": 0.000277, "vq_loss_layer_014": 0.000355, "vq_loss_layer_015": 0.000347, "vq_loss_layer_016": 0.000324, "vq_loss_layer_017": 0.000288, "vq_loss_layer_018": 0.000196, "vq_loss_layer_019": 0.000135, "vq_loss_layer_020": 0.000192, "vq_loss_layer_021": 0.000299, "vq_loss_layer_022": 0.000206, "vq_loss_layer_023": 0.000244, "vq_loss_layer_024": 0.000228, "vq_loss_layer_025": 0.000246, "vq_loss_layer_026": 0.000404, "vq_loss_layer_027": 0.000456, "vq_loss_layer_028": 0.000599, "vq_loss_layer_029": 0.00071, "vq_loss_layer_030": 0.001465, "vq_loss_layer_031": 0.002243 } ], "logging_steps": 10, "max_steps": 1000000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.482938741705933e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }